Mercurial > repos > bgruening > bismark

diff bismark_mapping/bismark_genome_preparation @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author: bgruening
date: Sat, 06 May 2017 13:18:09 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bismark_mapping/bismark_genome_preparation	Sat May 06 13:18:09 2017 -0400
@@ -0,0 +1,632 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Cwd;
+use Getopt::Long;
+$|++;
+
+
+## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+my $verbose;
+my $help;
+my $version;
+my $man;
+my $path_to_bowtie;
+my $multi_fasta;
+my $single_fasta;
+my $bowtie2;
+my $bowtie1;
+my $parent_dir = getcwd();
+
+my $genomic_composition;
+my %genomic_freqs; # storing the genomic sequence composition
+my %freqs;
+
+my $bismark_version = 'v0.16.3';
+
+GetOptions ('verbose' => \$verbose,
+	    'help' => \$help,
+	    'man' => \$man,
+	    'version' => \$version,
+	    'path_to_bowtie:s' => \$path_to_bowtie,
+	    'single_fasta' => \$single_fasta,
+	    'bowtie2' => \$bowtie2,
+	    'bowtie1' => \$bowtie1,
+	    'genomic_composition' => \$genomic_composition,
+	   );
+
+if ($help or $man){
+  print_helpfile();
+  exit;
+}
+
+if ($version){
+  print << "VERSION";
+
+          Bismark - Bisulfite Mapper and Methylation Caller.
+
+          Bismark Genome Preparation Version: $bismark_version
+        Copyright 2010-16 Felix Krueger, Babraham Bioinformatics
+              www.bioinformatics.babraham.ac.uk/projects/
+
+VERSION
+  exit;
+}
+
+my $genome_folder = shift @ARGV; # mandatory
+my %chromosomes; # checking if chromosome names are unique (required)
+
+# Ensuring a genome folder has been specified
+if ($genome_folder){
+    unless ($genome_folder =~ /\/$/){
+	$genome_folder =~ s/$/\//;
+    }
+    $verbose and print "Path to genome folder specified as: $genome_folder\n";
+    chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+    
+    # making the genome folder path abolsolute so it won't break if the path was specified relative
+    $genome_folder = getcwd();
+    unless ($genome_folder =~ /\/$/){
+	$genome_folder =~ s/$/\//;
+    }
+}
+else{
+    die "Please specify a genome folder to be used for bisulfite conversion\n\n";
+}
+
+
+my $CT_dir;
+my $GA_dir;
+
+if ($bowtie1){
+  if ($bowtie2){
+    die "You may not select both --bowtie1 and --bowtie2. Make your pick! (default is Bowtie2)\n";
+  }
+  $bowtie2 = 0;
+  $verbose and print "Aligner to be used: Bowtie (1)\n";
+}
+else{ # Bowtie 2 is now the default mode (as of 27 July 2015)
+  if ($bowtie2){
+    $verbose and print "Aligner to be used: Bowtie 2 (user-defined)\n";
+  }
+  else{
+    $verbose and print "Aligner to be used: Bowtie 2 (default)\n";
+  }
+  $bowtie2 = 1;
+}
+
+if ($single_fasta){
+  warn "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
+  $multi_fasta = 0;
+}
+else{
+  warn "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
+  $single_fasta = 0;
+  $multi_fasta = 1;
+}
+
+my @filenames = create_bisulfite_genome_folders();
+
+if ($genomic_composition){
+    get_genomic_frequencies();
+    warn "Finished processing genomic nucleotide frequencies\n\n";
+    %chromosomes = (); # resetting
+}
+
+process_sequence_files ();
+
+launch_bowtie_indexer();
+
+sub launch_bowtie_indexer{
+  if ($bowtie2){
+    warn "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
+  }
+  else{
+    warn "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
+  }
+  print "Please be aware that this process can - depending on genome size - take several hours!\n";
+  sleep(1);
+
+  ### if the path to bowtie was specfified explicitely
+  if ($path_to_bowtie){
+    if ($bowtie2){
+      $path_to_bowtie =~ s/$/bowtie2-build/;
+    }
+    else{
+      $path_to_bowtie =~ s/$/bowtie-build/;
+    }
+  }
+  ### otherwise we assume that bowtie-build is in the path
+  else{
+    if ($bowtie2){
+      $path_to_bowtie = 'bowtie2-build';
+    }
+    else{
+      $path_to_bowtie = 'bowtie-build';
+    }
+  }
+
+  $verbose and print "\n";
+
+  ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
+  my $pid = fork();
+
+  # parent process
+  if ($pid){
+    sleep(1);
+    chdir $CT_dir or die "Unable to change directory: $!\n";
+    $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
+    my @fasta_files = <*.fa>;
+    my $file_list = join (',',@fasta_files);
+    $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
+    $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
+
+    system ("$path_to_bowtie","-f","$file_list","BS_CT")== 0 or die "Parent process: Failed to build index\n";
+    waitpid( $pid, 0 );
+    $? == 0 or die "Parent process: Child process failed\n";
+  }
+
+  # child process
+  elsif ($pid == 0){
+    sleep(2);
+    chdir $GA_dir or die "Unable to change directory: $!\n";
+    $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
+    my @fasta_files = <*.fa>;
+    my $file_list = join (',',@fasta_files);
+    $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
+    $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
+
+    exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+  }
+
+  # if the platform doesn't support the fork command we will run the indexing processes one after the other
+  else{
+    print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
+    # sleep(10);
+
+    ### moving to CT genome folder
+    $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
+    chdir $CT_dir or die "Unable to change directory: $!\n";
+    my @fasta_files = <*.fa>;
+    my $file_list = join (',',@fasta_files);
+    $verbose and print "$file_list\n\n";
+    # sleep(2);
+    system ("$path_to_bowtie","-f","$file_list","BS_CT");
+    @fasta_files=();
+    $file_list= '';
+
+    ### moving to GA genome folder
+    $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
+    chdir $GA_dir or die "Unable to change directory: $!\n";
+    @fasta_files = <*.fa>;
+    $file_list = join (',',@fasta_files);
+    $verbose and print "$file_list\n\n";
+    # sleep(2);
+    exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+  }
+}
+
+
+sub process_sequence_files {
+
+  my ($total_CT_conversions,$total_GA_conversions) = (0,0);
+  $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
+  sleep (3);
+
+  $verbose and print "conversions performed:\n";
+  $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
+
+
+  ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
+  ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
+  ### This is now the default option
+
+  if ($multi_fasta){
+    ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
+    my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
+    open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+
+    my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
+    open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+  }
+
+  foreach my $filename(@filenames){
+    my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+    open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
+    # warn "Reading chromosome information from $filename\n\n";
+
+    ### first line needs to be a fastA header
+    my $first_line = <IN>;
+    chomp $first_line;
+
+    ### Extracting chromosome name from the FastA header
+    my $chromosome_name = extract_chromosome_name($first_line);
+
+    ### Exiting if a chromosome with the same name was present already
+    if (exists $chromosomes{$chromosome_name}){
+      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+    }
+    else{
+      $chromosomes{$chromosome_name}++;
+    }
+
+    ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+    unless ($multi_fasta){
+      my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+      $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+      open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+
+      my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+      $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+      open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+    }
+
+    print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
+    print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
+
+
+    while (<IN>){
+
+      ### in case the line is a new fastA header
+      if ($_ =~ /^>/){
+	### printing out the stats for the previous chromosome
+	$verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+	### resetting the chromosome transliteration counters
+	($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+	
+	### Extracting chromosome name from the additional FastA header
+	$chromosome_name = extract_chromosome_name($_);
+
+	### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+	unless ($multi_fasta){
+	  my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+	  $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+	  open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+	
+	  my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+	  $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+	  open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+	}
+
+	print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
+	print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
+      }
+
+      else{
+	my $sequence = uc$_;
+
+	### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
+	
+	$sequence =~ s/[^ATCGN\n\r]/N/g;
+	
+	### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
+	
+	my $CT_sequence = $sequence;
+	my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
+	$total_CT_conversions += $CT_transliterations_performed;
+	$chromosome_CT_conversions += $CT_transliterations_performed;
+	
+	print CT_CONVERT $CT_sequence;
+	
+	### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
+	### complementing the forward strand and then C->T converting it)
+	
+	my $GA_sequence = $sequence;
+	my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
+	$total_GA_conversions += $GA_transliterations_performed;
+	$chromosome_GA_conversions += $GA_transliterations_performed;
+	
+	print GA_CONVERT $GA_sequence;
+	
+      }
+    }
+    $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+  }
+  close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
+  close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
+
+
+  print "\nTotal number of conversions performed:\n";
+  print "C->T:\t$total_CT_conversions\n";
+  print "G->A:\t$total_GA_conversions\n";
+
+  warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
+}
+
+sub get_genomic_frequencies{
+    
+    warn "Calculating genomic frequencies (this may take several minutes depending on genome size) ...\n";
+    warn "="x164,"\n";
+    read_genome_into_memory($parent_dir);
+    foreach my $chr (keys %chromosomes){
+	warn "Processing chromosome >> $chr <<\n";
+	process_sequence($chromosomes{$chr});
+    }
+    
+    %genomic_freqs = %freqs;
+    
+    ### Attempting to write a genomic nucleotide frequency table out to the genome folder so we can re-use it next time without the need to re-calculate
+    ### if this fails
+    if  ( open (FREQS,'>',"${genome_folder}genomic_nucleotide_frequencies.txt") ){
+	warn "Writing genomic nucleotide frequencies to the file >${genome_folder}genomic_nucleotide_frequencies.txt< for future re-use\n";
+	foreach my $f(sort keys %genomic_freqs){
+	    warn "Writing count of (di-)nucleotide: $f\t$genomic_freqs{$f}\n";
+	    print FREQS "$f\t$genomic_freqs{$f}\n";
+	}
+	close FREQS or warn "Failed to close filehandle FREQS: $!\n\n";
+    }
+    else{
+	warn "Failed to write out file ${genome_folder}genomic_nucleotide_frequencies.txt because of: $!. Skipping writing out genomic frequency table\n\n";
+    }
+}
+
+
+sub process_sequence{
+    
+    my $seq = shift;
+    my $mono;
+    my $di;
+
+    foreach my $index (0..(length$seq)-1){
+	my $counted = 0;
+	if ($index%10000000==0){
+	    # warn "Current index number is $index\n";
+	}
+	
+	$mono = substr($seq,$index,1);
+	unless ( $mono eq 'N'){
+	    $freqs{$mono}++;
+	}
+	
+	unless ( ($index + 2) > length$seq ){
+	    $di = substr($seq,$index,2);
+	    if (index($di,'N') < 0) {
+		$freqs{$di}++;
+	    }
+	}
+    }
+    
+}
+
+sub extract_chromosome_name {
+  ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
+  my $fasta_header = shift;
+  if ($fasta_header =~ s/^>//){
+    my ($chromosome_name) = split (/\s+/,$fasta_header);
+    return $chromosome_name;
+  }
+  else{
+    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
+  }
+}
+
+
+sub create_bisulfite_genome_folders{
+
+  $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
+
+  if ($path_to_bowtie){
+    unless ($path_to_bowtie =~ /\/$/){
+      $path_to_bowtie =~ s/$/\//;
+    }
+    if (chdir $path_to_bowtie){
+      if ($bowtie2){
+	$verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
+      }
+      else{
+	$verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
+      }
+    }
+    else{
+      die "There was an error with the path to bowtie: $!\n";
+    }
+  }
+
+  chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+
+
+  # Exiting unless there are fastA files in the folder
+  my @filenames = <*.fa>;
+
+  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+  unless (@filenames){
+    @filenames =  <*.fasta>;
+  }
+
+  unless (@filenames){
+    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
+  }
+
+  warn "Bisulfite Genome Indexer version $bismark_version (last modified 25 August 2015)\n\n";
+  sleep (3);
+
+  # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
+  my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
+  unless (-d $bisulfite_dir){
+    mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
+    $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
+  }
+  else{
+    print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
+    sleep(5);
+  }
+
+  chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
+  $CT_dir = "${bisulfite_dir}CT_conversion/";
+  $GA_dir = "${bisulfite_dir}GA_conversion/";
+
+  # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
+  # converted version of the genome
+  unless (-d $CT_dir){
+    mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
+    $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
+  }
+  unless (-d $GA_dir){
+    mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
+    $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
+  }
+
+  # moving back to the original genome folder
+  chdir $genome_folder or die "Could't move to directory $genome_folder $!";
+  # $verbose and print "Moved back to genome folder folder $genome_folder\n";
+  warn "\nStep I - Prepare genome folders - completed\n\n\n";
+  return @filenames;
+}
+
+sub read_genome_into_memory{
+
+  ## reading in and storing the specified genome in the %chromosomes hash
+  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
+  warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
+
+  my @chromosome_filenames =  <*.fa>;
+
+  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+  unless (@chromosome_filenames){
+    @chromosome_filenames =  <*.fasta>;
+  }
+  unless (@chromosome_filenames){
+    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
+  }
+
+  foreach my $chromosome_filename (@chromosome_filenames){
+
+    # skipping the tophat entire mouse genome fasta file
+    next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
+
+    open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
+    ### first line needs to be a fastA header
+    my $first_line = <CHR_IN>;
+    chomp $first_line;
+    $first_line =~ s/\r//; # removing /r carriage returns
+
+    ### Extracting chromosome name from the FastA header
+    my $chromosome_name = extract_chromosome_name($first_line);
+	
+    my $sequence;
+    while (<CHR_IN>){
+      chomp;
+      $_ =~ s/\r//; # removing /r carriage returns
+
+      if ($_ =~ /^>/){
+	### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
+	if (exists $chromosomes{$chromosome_name}){
+	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
+	  die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+	}
+	else {
+	  if (length($sequence) == 0){
+	    warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
+	  }
+	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
+	  $chromosomes{$chromosome_name} = $sequence;
+	}
+	### resetting the sequence variable
+	$sequence = '';
+	### setting new chromosome name
+	$chromosome_name = extract_chromosome_name($_);
+      }
+      else{
+	$sequence .= uc$_;
+      }
+    }
+
+    if (exists $chromosomes{$chromosome_name}){
+      warn "chr $chromosome_name (",length $sequence ," bp)\t";
+      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
+    }
+    else{
+      if (length($sequence) == 0){
+	warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
+      }
+      warn "chr $chromosome_name (",length $sequence ," bp)\n";
+      $chromosomes{$chromosome_name} = $sequence;
+    }
+  }
+  warn "\n";
+  chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
+}
+
+
+
+
+sub print_helpfile{
+  print << 'HOW_TO';
+
+
+DESCRIPTION
+
+This script is supposed to convert a specified reference genome into two different bisulfite
+converted versions and index them for alignments with Bowtie 2 (default), or Bowtie 1. The first
+bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
+converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
+genome folder. Once the bisulfite conversion has been completed the program will fork and launch
+two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
+that the indexing process can take up to several hours; this will mainly depend on genome size
+and system resources.
+
+
+
+The following is a brief description of command line options and arguments to control the
+Bismark Genome Preparation:
+
+
+USAGE: bismark_genome_preparation [options] <arguments>
+
+
+OPTIONS:
+
+--help/--man             Displays this help filea and exits.
+
+--version                Displays version information and exits.
+
+--verbose                Print verbose output for more details or debugging.
+
+--path_to_bowtie </../>  The full path to the Bowtie 1 or Bowtie 2 installation on your system
+                         (depending on which aligner/indexer you intend to use). Unless this path
+                         is specified it is assumed that Bowtie is in the PATH.
+
+--bowtie2                This will create bisulfite indexes for Bowtie 2. (Default: ON).
+
+--bowtie1                This will create bisulfite indexes for Bowtie 1. (Default: OFF).
+
+--single_fasta           Instruct the Bismark Indexer to write the converted genomes into
+                         single-entry FastA files instead of making one multi-FastA file (MFA)
+                         per chromosome. This might be useful if individual bisulfite converted
+                         chromosomes are needed (e.g. for debugging), however it can cause a
+                         problem with indexing if the number of chromosomes is vast (this is likely
+                         to be in the range of several thousand files; the operating system can
+                         only handle lists up to a certain length, and some newly assembled
+                         genomes may contain 20000-500000 contigs of scaffold files which do exceed
+                         this list length limit).
+
+--genomic_composition    Calculate and extract the genomic sequence composition for mono and di-nucleotides
+                         and write the genomic composition table 'genomic_nucleotide_frequencies.txt' to the
+                         genome folder. This may be useful later on when using bam2nuc or the Bismark option
+                         --nucleotide_coverage.
+
+
+ARGUMENTS:
+
+<path_to_genome_folder>  The path to the folder containing the genome to be bisulfite converted.
+                         The Bismark Genome Preparation expects one or more fastA files in the folder
+                         (with the file extension: .fa or .fasta). Specifying this path is mandatory.
+
+
+This script was last modified on 07 July 2016.
+HOW_TO
+}
author	bgruening
date	Sat, 06 May 2017 13:18:09 -0400
parents
children