0
+ − 1 #!/usr/bin/perl --
+ − 2 use strict;
+ − 3 use warnings;
+ − 4 use Cwd;
+ − 5 use File::Path qw(rmtree);
+ − 6 $|++;
+ − 7
+ − 8
+ − 9 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
+ − 10
+ − 11 ## This program is free software: you can redistribute it and/or modify
+ − 12 ## it under the terms of the GNU General Public License as published by
+ − 13 ## the Free Software Foundation, either version 3 of the License, or
+ − 14 ## (at your option) any later version.
+ − 15
+ − 16 ## This program is distributed in the hope that it will be useful,
+ − 17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
+ − 18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ − 19 ## GNU General Public License for more details.
+ − 20
+ − 21 ## You should have received a copy of the GNU General Public License
+ − 22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
+ − 23
+ − 24 use Getopt::Long;
+ − 25 use Cwd;
+ − 26
+ − 27 my $verbose;
+ − 28 my $help;
+ − 29 my $version;
+ − 30 my $man;
+ − 31 my $path_to_bowtie;
+ − 32 my $multi_fasta;
+ − 33 my $single_fasta;
+ − 34 my $bowtie2;
+ − 35
3
+ − 36 my $bismark_version = 'v0.10.0';
0
+ − 37
+ − 38 GetOptions ('verbose' => \$verbose,
+ − 39 'help' => \$help,
+ − 40 'man' => \$man,
+ − 41 'version' => \$version,
+ − 42 'path_to_bowtie:s' => \$path_to_bowtie,
+ − 43 'single_fasta' => \$single_fasta,
+ − 44 'bowtie2' => \$bowtie2,
+ − 45 );
+ − 46
+ − 47 if ($help or $man){
+ − 48 print_helpfile();
+ − 49 exit;
+ − 50 }
+ − 51
+ − 52 if ($version){
+ − 53 print << "VERSION";
+ − 54
+ − 55 Bismark - Bisulfite Mapper and Methylation Caller.
+ − 56
+ − 57 Bismark Genome Preparation Version: $bismark_version
+ − 58 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
+ − 59 www.bioinformatics.babraham.ac.uk/projects/
+ − 60
+ − 61 VERSION
+ − 62 exit;
+ − 63 }
+ − 64
3
+ − 65 my $genome_folder = shift @ARGV; # mandatory
+ − 66
+ − 67 # Ensuring a genome folder has been specified
+ − 68 if ($genome_folder){
+ − 69 unless ($genome_folder =~ /\/$/){
+ − 70 $genome_folder =~ s/$/\//;
+ − 71 }
+ − 72 $verbose and print "Path to genome folder specified as: $genome_folder\n";
+ − 73 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+ − 74
+ − 75 # making the genome folder path abolsolute so it won't break if the path was specified relative
+ − 76 $genome_folder = getcwd;
+ − 77 unless ($genome_folder =~ /\/$/){
+ − 78 $genome_folder =~ s/$/\//;
+ − 79 }
+ − 80 }
+ − 81 else{
+ − 82 die "Please specify a genome folder to be used for bisulfite conversion\n\n";
+ − 83 }
+ − 84
+ − 85
+ − 86 my $CT_dir;
+ − 87 my $GA_dir;
+ − 88
+ − 89
0
+ − 90 if ($single_fasta){
+ − 91 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
+ − 92 $multi_fasta = 0;
+ − 93 }
+ − 94 else{
+ − 95 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
+ − 96 $single_fasta = 0;
+ − 97 $multi_fasta = 1;
+ − 98 }
+ − 99
+ − 100 my @filenames = create_bisulfite_genome_folders();
+ − 101
+ − 102 process_sequence_files ();
+ − 103
+ − 104 launch_bowtie_indexer();
+ − 105
+ − 106 sub launch_bowtie_indexer{
+ − 107 if ($bowtie2){
+ − 108 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
+ − 109 }
+ − 110 else{
+ − 111 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
+ − 112 }
+ − 113 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
+ − 114 sleep(5);
+ − 115
+ − 116 ### if the path to bowtie was specfified explicitely
+ − 117 if ($path_to_bowtie){
+ − 118 if ($bowtie2){
+ − 119 $path_to_bowtie =~ s/$/bowtie2-build/;
+ − 120 }
+ − 121 else{
+ − 122 $path_to_bowtie =~ s/$/bowtie-build/;
+ − 123 }
+ − 124 }
+ − 125 ### otherwise we assume that bowtie-build is in the path
+ − 126 else{
+ − 127 if ($bowtie2){
+ − 128 $path_to_bowtie = 'bowtie2-build';
+ − 129 }
+ − 130 else{
+ − 131 $path_to_bowtie = 'bowtie-build';
+ − 132 }
+ − 133 }
+ − 134
+ − 135 $verbose and print "\n";
+ − 136
+ − 137 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
+ − 138 my $pid = fork();
+ − 139
+ − 140 # parent process
+ − 141 if ($pid){
+ − 142 sleep(1);
+ − 143 chdir $CT_dir or die "Unable to change directory: $!\n";
+ − 144 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
+ − 145 my @fasta_files = <*.fa>;
+ − 146 my $file_list = join (',',@fasta_files);
+ − 147 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
+ − 148 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
+ − 149
+ − 150 sleep (11);
+ − 151 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
+ − 152 }
+ − 153
+ − 154 # child process
+ − 155 elsif ($pid == 0){
+ − 156 sleep(2);
+ − 157 chdir $GA_dir or die "Unable to change directory: $!\n";
+ − 158 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
+ − 159 my @fasta_files = <*.fa>;
+ − 160 my $file_list = join (',',@fasta_files);
+ − 161 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
+ − 162 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
+ − 163 $verbose and print "(starting in 10 seconds)\n";
+ − 164 sleep(10);
+ − 165 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+ − 166 }
+ − 167
+ − 168 # if the platform doesn't support the fork command we will run the indexing processes one after the other
+ − 169 else{
+ − 170 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
+ − 171 sleep(10);
+ − 172
+ − 173 ### moving to CT genome folder
+ − 174 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
+ − 175 chdir $CT_dir or die "Unable to change directory: $!\n";
+ − 176 my @fasta_files = <*.fa>;
+ − 177 my $file_list = join (',',@fasta_files);
+ − 178 $verbose and print "$file_list\n\n";
+ − 179 sleep(2);
+ − 180 system ("$path_to_bowtie","-f","$file_list","BS_CT");
+ − 181 @fasta_files=();
+ − 182 $file_list= '';
+ − 183
+ − 184 ### moving to GA genome folder
+ − 185 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
+ − 186 chdir $GA_dir or die "Unable to change directory: $!\n";
+ − 187 @fasta_files = <*.fa>;
+ − 188 $file_list = join (',',@fasta_files);
+ − 189 $verbose and print "$file_list\n\n";
+ − 190 sleep(2);
+ − 191 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+ − 192 }
+ − 193 }
+ − 194
+ − 195
+ − 196 sub process_sequence_files {
+ − 197
+ − 198 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
+ − 199 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
+ − 200 sleep (3);
+ − 201
+ − 202 $verbose and print "conversions performed:\n";
+ − 203 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
+ − 204
+ − 205
+ − 206 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
+ − 207 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
+ − 208 ### This is now the default option
+ − 209
+ − 210 if ($multi_fasta){
+ − 211 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
+ − 212 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
+ − 213 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+ − 214
+ − 215 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
+ − 216 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ − 217 }
+ − 218
+ − 219 foreach my $filename(@filenames){
+ − 220 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+ − 221 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
+ − 222 # warn "Reading chromosome information from $filename\n\n";
+ − 223
+ − 224 ### first line needs to be a fastA header
+ − 225 my $first_line = <IN>;
+ − 226 chomp $first_line;
+ − 227
+ − 228 ### Extracting chromosome name from the FastA header
+ − 229 my $chromosome_name = extract_chromosome_name($first_line);
+ − 230
+ − 231 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+ − 232 unless ($multi_fasta){
+ − 233 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+ − 234 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+ − 235 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+ − 236
+ − 237 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+ − 238 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+ − 239 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ − 240 }
+ − 241
+ − 242 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
+ − 243 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
+ − 244
+ − 245
+ − 246 while (<IN>){
+ − 247
+ − 248 ### in case the line is a new fastA header
+ − 249 if ($_ =~ /^>/){
+ − 250 ### printing out the stats for the previous chromosome
+ − 251 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+ − 252 ### resetting the chromosome transliteration counters
+ − 253 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+ − 254
+ − 255 ### Extracting chromosome name from the additional FastA header
+ − 256 $chromosome_name = extract_chromosome_name($_);
+ − 257
+ − 258 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+ − 259 unless ($multi_fasta){
+ − 260 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+ − 261 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+ − 262 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+ − 263
+ − 264 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+ − 265 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+ − 266 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ − 267 }
+ − 268
+ − 269 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
+ − 270 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
+ − 271 }
+ − 272
+ − 273 else{
+ − 274 my $sequence = uc$_;
+ − 275
+ − 276 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
+ − 277
+ − 278 $sequence =~ s/[^ATCGN\n\r]/N/g;
+ − 279
+ − 280 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
+ − 281
+ − 282 my $CT_sequence = $sequence;
+ − 283 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
+ − 284 $total_CT_conversions += $CT_transliterations_performed;
+ − 285 $chromosome_CT_conversions += $CT_transliterations_performed;
+ − 286
+ − 287 print CT_CONVERT $CT_sequence;
+ − 288
+ − 289 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
+ − 290 ### complementing the forward strand and then C->T converting it)
+ − 291
+ − 292 my $GA_sequence = $sequence;
+ − 293 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
+ − 294 $total_GA_conversions += $GA_transliterations_performed;
+ − 295 $chromosome_GA_conversions += $GA_transliterations_performed;
+ − 296
+ − 297 print GA_CONVERT $GA_sequence;
+ − 298
+ − 299 }
+ − 300 }
+ − 301 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+ − 302 }
+ − 303 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
+ − 304 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
+ − 305
+ − 306
+ − 307 print "\nTotal number of conversions performed:\n";
+ − 308 print "C->T:\t$total_CT_conversions\n";
+ − 309 print "G->A:\t$total_GA_conversions\n";
+ − 310
+ − 311 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
+ − 312 }
+ − 313
+ − 314 sub extract_chromosome_name {
+ − 315
+ − 316 my $header = shift;
+ − 317
+ − 318 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
+ − 319
+ − 320 if ($header =~ s/^>//){
+ − 321 my ($chromosome_name) = split (/\s+/,$header);
+ − 322 return $chromosome_name;
+ − 323 }
+ − 324 else{
+ − 325 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
+ − 326 }
+ − 327 }
+ − 328
+ − 329 sub create_bisulfite_genome_folders{
+ − 330
+ − 331 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
+ − 332
+ − 333 if ($path_to_bowtie){
+ − 334 unless ($path_to_bowtie =~ /\/$/){
+ − 335 $path_to_bowtie =~ s/$/\//;
+ − 336 }
+ − 337 if (chdir $path_to_bowtie){
+ − 338 if ($bowtie2){
+ − 339 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
+ − 340 }
+ − 341 else{
+ − 342 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
+ − 343 }
+ − 344 }
+ − 345 else{
+ − 346 die "There was an error with the path to bowtie: $!\n";
+ − 347 }
+ − 348 }
+ − 349
+ − 350 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+ − 351
+ − 352
+ − 353 # Exiting unless there are fastA files in the folder
+ − 354 my @filenames = <*.fa>;
+ − 355
+ − 356 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+ − 357 unless (@filenames){
+ − 358 @filenames = <*.fasta>;
+ − 359 }
+ − 360
+ − 361 unless (@filenames){
+ − 362 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
+ − 363 }
+ − 364
3
+ − 365 warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
0
+ − 366 sleep (3);
+ − 367
+ − 368 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
+ − 369 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
+ − 370 unless (-d $bisulfite_dir){
+ − 371 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
+ − 372 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
+ − 373 }
+ − 374 else{
3
+ − 375 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
+ − 376 sleep(5);
0
+ − 377 }
+ − 378
+ − 379 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
+ − 380 $CT_dir = "${bisulfite_dir}CT_conversion/";
+ − 381 $GA_dir = "${bisulfite_dir}GA_conversion/";
+ − 382
+ − 383 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
+ − 384 # converted version of the genome
+ − 385 unless (-d $CT_dir){
+ − 386 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
+ − 387 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
+ − 388 }
+ − 389 unless (-d $GA_dir){
+ − 390 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
+ − 391 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
+ − 392 }
+ − 393
+ − 394 # moving back to the original genome folder
+ − 395 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
+ − 396 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
+ − 397 warn "\nStep I - Prepare genome folders - completed\n\n\n";
+ − 398 return @filenames;
+ − 399 }
+ − 400
+ − 401 sub print_helpfile{
+ − 402 print << 'HOW_TO';
+ − 403
+ − 404
+ − 405 DESCRIPTION
+ − 406
+ − 407 This script is supposed to convert a specified reference genome into two different bisulfite
+ − 408 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
+ − 409 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
+ − 410 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
+ − 411 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
3
+ − 412 two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
0
+ − 413 that the indexing process can take up to several hours; this will mainly depend on genome size
+ − 414 and system resources.
+ − 415
+ − 416
+ − 417
+ − 418 The following is a brief description of command line options and arguments to control the
3
+ − 419 Bismark Genome Preparation:
0
+ − 420
+ − 421
+ − 422 USAGE: bismark_genome_preparation [options] <arguments>
+ − 423
+ − 424
+ − 425 OPTIONS:
+ − 426
+ − 427 --help/--man Displays this help filea and exits.
+ − 428
+ − 429 --version Displays version information and exits.
+ − 430
+ − 431 --verbose Print verbose output for more details or debugging.
+ − 432
3
+ − 433 --path_to_bowtie </../> The full path to the Bowtie 1 or Bowtie 2 installation on your system
+ − 434 (depending on which aligner/indexer you intend to use). Unless this path
+ − 435 is specified it is assumed that Bowtie is in the PATH.
0
+ − 436
+ − 437 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
+ − 438
+ − 439 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
+ − 440 single-entry FastA files instead of making one multi-FastA file (MFA)
+ − 441 per chromosome. This might be useful if individual bisulfite converted
+ − 442 chromosomes are needed (e.g. for debugging), however it can cause a
+ − 443 problem with indexing if the number of chromosomes is vast (this is likely
+ − 444 to be in the range of several thousand files; the operating system can
+ − 445 only handle lists up to a certain length, and some newly assembled
+ − 446 genomes may contain 20000-50000 contigs of scaffold files which do exceed
+ − 447 this list length limit).
+ − 448
+ − 449
+ − 450 ARGUMENTS:
+ − 451
+ − 452 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
3
+ − 453 The Bismark Genome Preparation expects one or more fastA files in the folder
+ − 454 (with the file extension: .fa or .fasta). Specifying this path is mandatory.
0
+ − 455
+ − 456
3
+ − 457 This script was last modified on 19 Sept 2013.
0
+ − 458 HOW_TO
+ − 459 }