Mercurial > repos > bgruening > bismark
diff bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/coverage2cytosine Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,1254 @@ +#!/usr/bin/env perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +my %chromosomes; # storing sequence information of all chromosomes/scaffolds +my %processed; # keeping a record of which chromosomes have been processed +my $coverage2cytosine_version = 'v0.16.3'; + +my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline(); + +warn "Summary of parameters for genome-wide cytosine report:\n"; +warn '='x78,"\n"; +warn "Coverage infile:\t\t\t$coverage_infile\n"; +warn "Output directory:\t\t\t>$output_dir<\n"; +warn "Parent directory:\t\t\t>$parent_dir<\n"; +warn "Genome directory:\t\t\t>$genome_folder<\n"; + +if ($CX_context){ + warn "CX context:\t\t\t\tyes\n"; +} +else{ + warn "CX context:\t\t\t\tno (CpG context only, default)\n"; +} +if ($merge_CpGs){ + warn "Pooling CpG top/bottom strand evidence:\tyes\n"; +} +if($gc_context){ + warn "Optional GC context track:\t\tyes\n"; +} +if ($tetra){ + warn "Tetra/Penta nucleotide context:\t\tyes\n"; +} + +if ($zero){ + warn "Genome coordinates used:\t\t0-based (user specified)\n"; +} +else{ + warn "Genome coordinates used:\t\t1-based (default)\n"; +} + +if ($gzip){ + warn "GZIP compression:\t\t\tyes\n"; +} +else{ + warn "GZIP compression:\t\t\tno\n"; +} + +if ($split_by_chromosome){ + warn "Split by chromosome:\t\t\tyes\n\n\n"; +} +else{ + warn "Split by chromosome:\t\t\tno\n\n\n"; +} +sleep (3); + +read_genome_into_memory(); +warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n"; + +generate_genome_wide_cytosine_report($coverage_infile); + +### 11 December 2014 + +# The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand +# into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package +# which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing + +if ($merge_CpGs){ + # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being + combine_CpGs_to_single_CG_entity($cytosine_out); +} + +### 18 August 2015 + +# The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed +if ($gc_context){ + generate_GC_context_report($coverage_infile); +} + + +sub combine_CpGs_to_single_CG_entity{ + my $CpG_report_file = shift; + warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n"; + + open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n"; + my $pooled_CG = $CpG_report_file; + $pooled_CG =~ s/$/.merged_CpG_evidence.cov/; + open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n"; + warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n"; + sleep(1); + + while (1){ + my $line1 = <IN>; + my $line2 = <IN>; + last unless ($line1 and $line2); + + chomp $line1; + chomp $line2; + + my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); + my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); + # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; + # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1); + + if ($pos1 < 2){ + $line1 = $line2; + $line2 = <IN>; + chomp $line2; + ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); + ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); + } + + # a few sanity checks + die "The context of the first line was not CG:\t$line1" unless ($context1 eq 'CG'); + die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG'); + + unless ($strand1 eq '+' and $strand2 eq'-'){ + die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n"; + } + unless ($pos2 eq ($pos1 + 1)){ + die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n"; + } + unless($chr1 eq $chr2){ + die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n"; + } + + # This looks alright from what I can tell, lets pool the 2 strands + + my $pooled_m = $m1 + $m2; + my $pooled_u = $u1 + $u2; + + # since this is a new coverage file we only write it out if the coverage is at least 1 + next unless ($pooled_m + $pooled_u) > 0; + + my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100); + # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; + # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; + # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; + # sleep(1); + print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; + } + + warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n"; + +} + + +sub process_commandline{ + my $help; + my $output_dir; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $gzip; + + my $split_by_chromosome; + my $cytosine_out; + my $parent_dir; + my $version; + my $merge_CpGs; + my $gc_context; + my $tetra; + + my $command_line = GetOptions ('help|man' => \$help, + 'dir=s' => \$output_dir, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + 'o|output=s' => \$cytosine_out, + 'parent_dir=s' => \$parent_dir, + 'version' => \$version, + 'merge_CpGs' => \$merge_CpGs, + 'GC|GC_context' => \$gc_context, + 'ff|qq' => \$tetra, + 'gzip' => \$gzip, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($version){ + print << "VERSION"; + + + Bismark Methylation Extractor Module - + coverage2cytosine + + Bismark Extractor Version: $coverage2cytosine_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + ### no files provided + unless (@ARGV){ + warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n"; + sleep(2); + + print_helpfile(); + exit; + } + + my $coverage_infile = shift @ARGV; + + unless ($parent_dir){ + $parent_dir = getcwd(); + } + unless ($parent_dir =~ /\/$/){ + $parent_dir =~ s/$/\//; + } + + unless (defined $cytosine_out){ + die "Please provide the name of the output file using the option -o/--output filename\n"; + } + + ### OUTPUT DIR PATH + if (defined $output_dir){ + unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + } + else{ + $output_dir = ''; + } + + unless ($CX_context){ + $CX_context = 0; + $CpG_only = 1; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + if ($merge_CpGs){ + if ($CX_context){ + die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n"; + } + if ($split_by_chromosome){ + die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n"; + } + } + + return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra); +} + + + +sub generate_genome_wide_cytosine_report { + + warn "="x78,"\n"; + warn "Methylation information will now be written into a genome-wide cytosine report\n"; + warn "="x78,"\n\n"; + sleep (2); + + my $number_processed = 0; + + ### changing to the output directory again + unless ($output_dir eq ''){ # default + chdir $output_dir or die "Failed to change directory to $output_dir\n"; + # warn "Changed directory to $output_dir\n"; + } + + my $in = shift; + # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir + if ($in =~ /gz$/){ + open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16 + } + else{ + open (IN,"$in") or die "Failed to read from file $in: $!\n"; + } + + + ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands + unless ($split_by_chromosome){ ### writing all output to a single file (default) + if ($gzip){ + unless ($cytosine_out =~ /\.gz$/){ + $cytosine_out .= '.gz'; + } + open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n"; + } + else{ + open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n"; + } + warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n"; + sleep (1); + } + + my $last_chr; + my %chr; # storing reads for one chromosome at a time + + my $count = 0; + while (<IN>){ + chomp; + ++$count; + my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); + + # defining the first chromosome + unless (defined $last_chr){ + $last_chr = $chr; + ++$number_processed; + # warn "Storing all covered cytosine positions for chromosome: $chr\n"; + } + + ### As of version 0.9.1 the start positions are 1-based! + if ($chr eq $last_chr){ + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + else{ + warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + ++$number_processed; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /\.txt$/){ + $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; + } + else{ + $chromosome_out =~ s/$/.chr${last_chr}.txt/; + } + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + + if ($tetra){ + $tetra_nt = ''; # clearing + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$last_chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + if ($tetra){ + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + $strand = '+'; + } + elsif ($1 eq 'G'){ # C on reverse strand + + $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + $strand = '-'; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + + # if (length$penta_nt < 5){ + # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); + # } + + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1) + $meth = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; + } + + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n" + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + %chr = (); # resetting the hash + + # new first entry + $last_chr = $chr; + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + } + + # Last found chromosome + # If there never was a last chromosome then something must have gone wrong with reading the data in + unless (defined $last_chr){ + die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n"; + } + + warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + $processed{$last_chr} = 1; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt + $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; + } + else{ # user specified output file name + $chromosome_out =~ s/$/.chr${last_chr}.txt/; + } + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + while ( $chromosomes{$last_chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + + if ($tetra){ + $tetra_nt = ''; + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$last_chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + $strand = '+'; + + if ($tetra){ + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + + } + elsif ($1 eq 'G'){ # C on reverse strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + $strand = '-'; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + } + + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1 + $meth = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + # if (length$penta_nt < 5){ + # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); + # } + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero-based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; + + ### Now processing chromosomes that were not covered in the coverage file + warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n"; + my $unprocessed = 0; + + foreach my $chr (sort keys %processed) { + unless ( $processed{$chr} ) { + ++$unprocessed; + ++$number_processed; + process_unprocessed_chromosomes($chr); + } + } + + if ($unprocessed == 0) { + warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n"; + } + else{ + warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n"; + } + + close CYT or warn $!; + +} + + +#### GC CONTEXT - optional +#### + +sub generate_GC_context_report { + + warn "="x82,"\n"; + warn "Methylation information for GC context will now be written to a GpC-context report\n"; + warn "="x82,"\n\n"; + sleep (2); + + my $number_processed = 0; + + ### changing to the output directory again + unless ($output_dir eq ''){ # default + chdir $output_dir or die "Failed to change directory to $output_dir\n"; + # warn "Changed directory to $output_dir\n"; + } + + my $in = shift; + if ($in =~ /gz$/){ + open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; + } + else{ + open (IN,"$in") or die "Failed to read from file $in: $!\n"; + } + + my $gc_out = $cytosine_out.'.GpC_report.txt'; + my $gc_cov = $cytosine_out.'.GpC.cov'; + + ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands + open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n"; + warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n"; + open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n"; + warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n"; + + my $last_chr; + my %chr; # storing reads for one chromosome at a time + + my $count = 0; + while (<IN>){ + chomp; + ++$count; + my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); + + # defining the first chromosome + unless (defined $last_chr){ + $last_chr = $chr; + ++$number_processed; + # warn "Storing all covered cytosine positions for chromosome: $chr\n"; + } + + ### start positions are 1-based + if ($chr eq $last_chr){ + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + else{ + warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + ++$number_processed; + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /(GC)/g){ + + # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time + my $context_top = ''; + my $context_bottom = ''; + my $pos = pos$chromosomes{$last_chr}; + + my $meth_top = 0; + my $meth_bottom = 0; + my $nonmeth_top = 0; + my $nonmeth_bottom = 0; + + #warn "$1\n"; sleep(1); + # C on forward strand + my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + my $strand_top = '+'; + + # C on reverse strand + my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! + $tri_nt_bottom = reverse $tri_nt_bottom; + $tri_nt_bottom =~ tr/ACTG/TGAC/; + my $strand_bottom = '-'; + + next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand + next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the reverse strand + + # top strand + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! + $meth_top = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; + } + # bottom strand + if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs + $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; + $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; + } + + ### determining cytosine context top strand + if ($tri_nt_top =~ /^CG/){ + $context_top = 'CG'; + } + elsif ($tri_nt_top =~ /^C.{1}G$/){ + $context_top = 'CHG'; + } + elsif ($tri_nt_top =~ /^C.{2}$/){ + $context_top = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; + next; + } + + ### determining cytosine context bottom strand + if ($tri_nt_bottom =~ /^CG/){ + $context_bottom = 'CG'; + } + elsif ($tri_nt_bottom =~ /^C.{1}G$/){ + $context_bottom = 'CHG'; + } + elsif ($tri_nt_bottom =~ /^C.{2}$/){ + $context_bottom = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; + next; + } + + # if Cs were not covered at all they are not written out + unless ($meth_bottom + $nonmeth_bottom == 0){ + my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); + print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; + print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; + } + unless ($meth_top + $nonmeth_top == 0){ + my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); + print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; + print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; + } + + } + + %chr = (); # resetting the hash + + # new first entry + $last_chr = $chr; + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + } + + # Last found chromosome + warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /(GC)/g){ + + # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time + my $context_top = ''; + my $context_bottom = ''; + my $pos = pos$chromosomes{$last_chr}; + + my $meth_top = 0; + my $meth_bottom = 0; + my $nonmeth_top = 0; + my $nonmeth_bottom = 0; + + # C on forward strand + my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + my $strand_top = '+'; + + # C on reverse strand + my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! + $tri_nt_bottom = reverse $tri_nt_bottom; + $tri_nt_bottom =~ tr/ACTG/TGAC/; + my $strand_bottom = '-'; + + next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand + next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the bottom strand + + ### determining cytosine context top strand + if ($tri_nt_top =~ /^CG/){ + $context_top = 'CG'; + } + elsif ($tri_nt_top =~ /^C.{1}G$/){ + $context_top = 'CHG'; + } + elsif ($tri_nt_top =~ /^C.{2}$/){ + $context_top = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; + next; + } + + ### determining cytosine context bottom strand + if ($tri_nt_bottom =~ /^CG/){ + $context_bottom = 'CG'; + } + elsif ($tri_nt_bottom =~ /^C.{1}G$/){ + $context_bottom = 'CHG'; + } + elsif ($tri_nt_bottom =~ /^C.{2}$/){ + $context_bottom = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; + next; + } + + # top strand + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! + $meth_top = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; + } + # bottom strand + if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs + $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; + $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; + } + + # if Cs were not covered at all they are not written out + unless ($meth_bottom + $nonmeth_bottom == 0){ + my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); + print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; + print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; + } + unless ($meth_top + $nonmeth_top == 0){ + my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); + print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; + print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; + } + } + + warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; + + close GC or warn $!; + +} + +#### +#### + + +sub process_unprocessed_chromosomes{ + + my $chr = shift; + + warn "Writing cytosine report for not covered chromosome $chr\n"; + $processed{$chr} = 1; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt + $chromosome_out =~ s/txt$/chr${chr}.txt/; + } + else{ # user specified output file name + $chromosome_out =~ s/$/.chr${chr}.txt/; + } + + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + while ( $chromosomes{$chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + if ($tetra){ + $tetra_nt = ''; # clearing + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$chr},($pos-1),3); # positions are 0-based! + $strand = '+'; + if ($tetra){ + if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + } + elsif ($1 eq 'G'){ # C on reverse strand + $tri_nt = substr ($chromosomes{$chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + $strand = '-'; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + $strand = '-'; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero-based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + # close CYT or warn $!; + +} + + +sub read_genome_into_memory{ + + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + # skipping the tophat entire mouse genome fasta file + next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = <CHR_IN>; + chomp $first_line; + $first_line =~ s/\r//; # removing /r carriage returns + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (<CHR_IN>){ + chomp; + $_ =~ s/\r//; # removing /r carriage returns + + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed + } + } + warn "\n"; + chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; +} + +sub extract_chromosome_name { + ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + + +sub print_helpfile{ + + warn <<EOF + + SYNOPSIS: + + This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced + by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only + (for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'. + + The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be + sorted by position and exactly in the following format: + + <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count unmethylated> + + The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!). + + + USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input] + + +-o/--output <filename> Name of the output file, mandatory. + +--dir Output directory. Output is written to the current directory if not specified explicitly. + +--genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +-CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--merge_CpG Using this option will post-process the genome-wide report to write out an additional coverage + file (see above for the coverage file format) which has the top and bottom strand methylation + evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format + for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be: + + genome-wide CpG report (old) + gi|9626372|ref|NC_001422.1| 157 + 313 156 CG + gi|9626372|ref|NC_001422.1| 158 - 335 156 CG + merged CpG evidence coverage file (new) + gi|9626372|ref|NC_001422.1| 157 158 67.500000 648 312 + + This option is currently experimental, and only works if CpG context only and a single genome-wide report + were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome). + +--gc/--gc_context In addition to normal processing this option will reprocess the genome to find methylation in + GpC context. This might be useful for specialist applications where GpC methylases had been + deployed. The output format is exactly the same as for the normal cytosine report, and only + positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition + this will write out a Bismark coverage file (ending in GpC.cov). + +--ff In addition to the standard output selecting --ff will also extract a four and five (tetra/penta) + nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the + chromosome) will be left blank; sequences containing Ns are ignored. + +--zero_based Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + +--gzip Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output. + +--help Displays this help message and exits + + + +OUTPUT FORMAT: + +The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords): +=========================================================================================================== + +<chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> + + + Script last modified: 04 April 2016 + +EOF + ; + exit 1; +} +