Mercurial > repos > bgruening > bismark
diff bismark_mapping/bismark_genome_preparation @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_mapping/bismark_genome_preparation Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,632 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Cwd; +use Getopt::Long; +$|++; + + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +my $verbose; +my $help; +my $version; +my $man; +my $path_to_bowtie; +my $multi_fasta; +my $single_fasta; +my $bowtie2; +my $bowtie1; +my $parent_dir = getcwd(); + +my $genomic_composition; +my %genomic_freqs; # storing the genomic sequence composition +my %freqs; + +my $bismark_version = 'v0.16.3'; + +GetOptions ('verbose' => \$verbose, + 'help' => \$help, + 'man' => \$man, + 'version' => \$version, + 'path_to_bowtie:s' => \$path_to_bowtie, + 'single_fasta' => \$single_fasta, + 'bowtie2' => \$bowtie2, + 'bowtie1' => \$bowtie1, + 'genomic_composition' => \$genomic_composition, + ); + +if ($help or $man){ + print_helpfile(); + exit; +} + +if ($version){ + print << "VERSION"; + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Genome Preparation Version: $bismark_version + Copyright 2010-16 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + +VERSION + exit; +} + +my $genome_folder = shift @ARGV; # mandatory +my %chromosomes; # checking if chromosome names are unique (required) + +# Ensuring a genome folder has been specified +if ($genome_folder){ + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } + $verbose and print "Path to genome folder specified as: $genome_folder\n"; + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + # making the genome folder path abolsolute so it won't break if the path was specified relative + $genome_folder = getcwd(); + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } +} +else{ + die "Please specify a genome folder to be used for bisulfite conversion\n\n"; +} + + +my $CT_dir; +my $GA_dir; + +if ($bowtie1){ + if ($bowtie2){ + die "You may not select both --bowtie1 and --bowtie2. Make your pick! (default is Bowtie2)\n"; + } + $bowtie2 = 0; + $verbose and print "Aligner to be used: Bowtie (1)\n"; +} +else{ # Bowtie 2 is now the default mode (as of 27 July 2015) + if ($bowtie2){ + $verbose and print "Aligner to be used: Bowtie 2 (user-defined)\n"; + } + else{ + $verbose and print "Aligner to be used: Bowtie 2 (default)\n"; + } + $bowtie2 = 1; +} + +if ($single_fasta){ + warn "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; + $multi_fasta = 0; +} +else{ + warn "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; + $single_fasta = 0; + $multi_fasta = 1; +} + +my @filenames = create_bisulfite_genome_folders(); + +if ($genomic_composition){ + get_genomic_frequencies(); + warn "Finished processing genomic nucleotide frequencies\n\n"; + %chromosomes = (); # resetting +} + +process_sequence_files (); + +launch_bowtie_indexer(); + +sub launch_bowtie_indexer{ + if ($bowtie2){ + warn "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; + } + else{ + warn "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; + } + print "Please be aware that this process can - depending on genome size - take several hours!\n"; + sleep(1); + + ### if the path to bowtie was specfified explicitely + if ($path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie =~ s/$/bowtie2-build/; + } + else{ + $path_to_bowtie =~ s/$/bowtie-build/; + } + } + ### otherwise we assume that bowtie-build is in the path + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2-build'; + } + else{ + $path_to_bowtie = 'bowtie-build'; + } + } + + $verbose and print "\n"; + + ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) + my $pid = fork(); + + # parent process + if ($pid){ + sleep(1); + chdir $CT_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; + + system ("$path_to_bowtie","-f","$file_list","BS_CT")== 0 or die "Parent process: Failed to build index\n"; + waitpid( $pid, 0 ); + $? == 0 or die "Parent process: Child process failed\n"; + } + + # child process + elsif ($pid == 0){ + sleep(2); + chdir $GA_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; + + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } + + # if the platform doesn't support the fork command we will run the indexing processes one after the other + else{ + print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; + # sleep(10); + + ### moving to CT genome folder + $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; + chdir $CT_dir or die "Unable to change directory: $!\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + # sleep(2); + system ("$path_to_bowtie","-f","$file_list","BS_CT"); + @fasta_files=(); + $file_list= ''; + + ### moving to GA genome folder + $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; + chdir $GA_dir or die "Unable to change directory: $!\n"; + @fasta_files = <*.fa>; + $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + # sleep(2); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } +} + + +sub process_sequence_files { + + my ($total_CT_conversions,$total_GA_conversions) = (0,0); + $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; + sleep (3); + + $verbose and print "conversions performed:\n"; + $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; + + + ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file + ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle + ### This is now the default option + + if ($multi_fasta){ + ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files + my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + foreach my $filename(@filenames){ + my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; + # warn "Reading chromosome information from $filename\n\n"; + + ### first line needs to be a fastA header + my $first_line = <IN>; + chomp $first_line; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + ### Exiting if a chromosome with the same name was present already + if (exists $chromosomes{$chromosome_name}){ + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else{ + $chromosomes{$chromosome_name}++; + } + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry + + + while (<IN>){ + + ### in case the line is a new fastA header + if ($_ =~ /^>/){ + ### printing out the stats for the previous chromosome + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + ### resetting the chromosome transliteration counters + ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + + ### Extracting chromosome name from the additional FastA header + $chromosome_name = extract_chromosome_name($_); + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; + } + + else{ + my $sequence = uc$_; + + ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) + + $sequence =~ s/[^ATCGN\n\r]/N/g; + + ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) + + my $CT_sequence = $sequence; + my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts + $total_CT_conversions += $CT_transliterations_performed; + $chromosome_CT_conversions += $CT_transliterations_performed; + + print CT_CONVERT $CT_sequence; + + ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- + ### complementing the forward strand and then C->T converting it) + + my $GA_sequence = $sequence; + my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand + $total_GA_conversions += $GA_transliterations_performed; + $chromosome_GA_conversions += $GA_transliterations_performed; + + print GA_CONVERT $GA_sequence; + + } + } + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + } + close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; + close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; + + + print "\nTotal number of conversions performed:\n"; + print "C->T:\t$total_CT_conversions\n"; + print "G->A:\t$total_GA_conversions\n"; + + warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; +} + +sub get_genomic_frequencies{ + + warn "Calculating genomic frequencies (this may take several minutes depending on genome size) ...\n"; + warn "="x164,"\n"; + read_genome_into_memory($parent_dir); + foreach my $chr (keys %chromosomes){ + warn "Processing chromosome >> $chr <<\n"; + process_sequence($chromosomes{$chr}); + } + + %genomic_freqs = %freqs; + + ### Attempting to write a genomic nucleotide frequency table out to the genome folder so we can re-use it next time without the need to re-calculate + ### if this fails + if ( open (FREQS,'>',"${genome_folder}genomic_nucleotide_frequencies.txt") ){ + warn "Writing genomic nucleotide frequencies to the file >${genome_folder}genomic_nucleotide_frequencies.txt< for future re-use\n"; + foreach my $f(sort keys %genomic_freqs){ + warn "Writing count of (di-)nucleotide: $f\t$genomic_freqs{$f}\n"; + print FREQS "$f\t$genomic_freqs{$f}\n"; + } + close FREQS or warn "Failed to close filehandle FREQS: $!\n\n"; + } + else{ + warn "Failed to write out file ${genome_folder}genomic_nucleotide_frequencies.txt because of: $!. Skipping writing out genomic frequency table\n\n"; + } +} + + +sub process_sequence{ + + my $seq = shift; + my $mono; + my $di; + + foreach my $index (0..(length$seq)-1){ + my $counted = 0; + if ($index%10000000==0){ + # warn "Current index number is $index\n"; + } + + $mono = substr($seq,$index,1); + unless ( $mono eq 'N'){ + $freqs{$mono}++; + } + + unless ( ($index + 2) > length$seq ){ + $di = substr($seq,$index,2); + if (index($di,'N') < 0) { + $freqs{$di}++; + } + } + } + +} + +sub extract_chromosome_name { + ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + + +sub create_bisulfite_genome_folders{ + + $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; + + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (chdir $path_to_bowtie){ + if ($bowtie2){ + $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; + } + else{ + $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; + } + } + else{ + die "There was an error with the path to bowtie: $!\n"; + } + } + + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + + # Exiting unless there are fastA files in the folder + my @filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@filenames){ + @filenames = <*.fasta>; + } + + unless (@filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; + } + + warn "Bisulfite Genome Indexer version $bismark_version (last modified 25 August 2015)\n\n"; + sleep (3); + + # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists + my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; + unless (-d $bisulfite_dir){ + mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; + } + else{ + print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; + sleep(5); + } + + chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; + $CT_dir = "${bisulfite_dir}CT_conversion/"; + $GA_dir = "${bisulfite_dir}GA_conversion/"; + + # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) + # converted version of the genome + unless (-d $CT_dir){ + mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; + } + unless (-d $GA_dir){ + mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; + } + + # moving back to the original genome folder + chdir $genome_folder or die "Could't move to directory $genome_folder $!"; + # $verbose and print "Moved back to genome folder folder $genome_folder\n"; + warn "\nStep I - Prepare genome folders - completed\n\n\n"; + return @filenames; +} + +sub read_genome_into_memory{ + + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + # skipping the tophat entire mouse genome fasta file + next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = <CHR_IN>; + chomp $first_line; + $first_line =~ s/\r//; # removing /r carriage returns + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (<CHR_IN>){ + chomp; + $_ =~ s/\r//; # removing /r carriage returns + + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + } + warn "\n"; + chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; +} + + + + +sub print_helpfile{ + print << 'HOW_TO'; + + +DESCRIPTION + +This script is supposed to convert a specified reference genome into two different bisulfite +converted versions and index them for alignments with Bowtie 2 (default), or Bowtie 1. The first +bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs +converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference +genome folder. Once the bisulfite conversion has been completed the program will fork and launch +two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware +that the indexing process can take up to several hours; this will mainly depend on genome size +and system resources. + + + +The following is a brief description of command line options and arguments to control the +Bismark Genome Preparation: + + +USAGE: bismark_genome_preparation [options] <arguments> + + +OPTIONS: + +--help/--man Displays this help filea and exits. + +--version Displays version information and exits. + +--verbose Print verbose output for more details or debugging. + +--path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system + (depending on which aligner/indexer you intend to use). Unless this path + is specified it is assumed that Bowtie is in the PATH. + +--bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: ON). + +--bowtie1 This will create bisulfite indexes for Bowtie 1. (Default: OFF). + +--single_fasta Instruct the Bismark Indexer to write the converted genomes into + single-entry FastA files instead of making one multi-FastA file (MFA) + per chromosome. This might be useful if individual bisulfite converted + chromosomes are needed (e.g. for debugging), however it can cause a + problem with indexing if the number of chromosomes is vast (this is likely + to be in the range of several thousand files; the operating system can + only handle lists up to a certain length, and some newly assembled + genomes may contain 20000-500000 contigs of scaffold files which do exceed + this list length limit). + +--genomic_composition Calculate and extract the genomic sequence composition for mono and di-nucleotides + and write the genomic composition table 'genomic_nucleotide_frequencies.txt' to the + genome folder. This may be useful later on when using bam2nuc or the Bismark option + --nucleotide_coverage. + + +ARGUMENTS: + +<path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted. + The Bismark Genome Preparation expects one or more fastA files in the folder + (with the file extension: .fa or .fasta). Specifying this path is mandatory. + + +This script was last modified on 07 July 2016. +HOW_TO +}