comparison bismark_genome_preparation @ 3:7eefe5d6eecd draft

reupload
author bjoern-gruening
date Tue, 25 Dec 2012 05:54:01 -0500
parents 36d124f44c0a
children
comparison
equal deleted inserted replaced
2:371bb57ac729 3:7eefe5d6eecd
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use Cwd;
5 use File::Path qw(rmtree);
6 $|++;
7
8
9 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 use Getopt::Long;
25 use Cwd;
26
### Command line options (all of them are optional)
my $verbose;          # --verbose: print additional progress information
my $help;             # --help: print the help text and exit
my $version;          # --version: print version information and exit
my $man;              # --man: same as --help
my $path_to_bowtie;   # --path_to_bowtie: folder containing bowtie-build or bowtie2-build
my $multi_fasta;      # set further down: 1 = write one multi-FastA file per conversion (default)
my $single_fasta;     # --single_fasta: write one converted FastA file per chromosome instead
my $bowtie2;          # --bowtie2: build Bowtie 2 indexes instead of Bowtie (1) indexes

my $bismark_version = 'v0.7.7';

### GetOptions returns false if it came across unknown or malformed options (it has already printed a
### warning for each of them at that point). We stop here instead of carrying on with a partial configuration
GetOptions ('verbose'          => \$verbose,
            'help'             => \$help,
            'man'              => \$man,
            'version'          => \$version,
            'path_to_bowtie:s' => \$path_to_bowtie,
            'single_fasta'     => \$single_fasta,
            'bowtie2'          => \$bowtie2,
           ) or die "Please respecify command line options\n";

### The first remaining argument is the genome folder. If it is missing the user will be asked for it later on
my $genome_folder = shift @ARGV; # mandatory
my $CT_dir; # folder for the C->T converted genome, set by create_bisulfite_genome_folders()
my $GA_dir; # folder for the G->A converted genome, set by create_bisulfite_genome_folders()

if ($help or $man){
  print_helpfile();
  exit;
}

if ($version){
  print <<"VERSION";

Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Genome Preparation Version: $bismark_version
Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/

VERSION
  exit;
}

### Writing one multi-FastA file per conversion is the default, --single_fasta switches to one file per chromosome
if ($single_fasta){
  print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
  $multi_fasta = 0;
}
else{
  print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
  $single_fasta = 0;
  $multi_fasta = 1;
}

### Step I:   prepare the output folders and collect the names of the FastA files of the genome
my @filenames = create_bisulfite_genome_folders();

### Step II:  write out the C->T and G->A converted versions of the genome
process_sequence_files ();

### Step III: index both converted genomes with bowtie-build or bowtie2-build
launch_bowtie_indexer();

### Step III: indexes the two bisulfite converted genomes with bowtie-build (or bowtie2-build if --bowtie2
### was specified).
###
### Takes no arguments. Reads the file-level variables $bowtie2, $path_to_bowtie, $verbose, $CT_dir and $GA_dir.
### The program forks: the parent process indexes the C->T genome and the child process the G->A genome. Both
### replace themselves with the indexer via exec, so this subroutine does not return on success. If the fork
### fails both genomes are indexed one after the other instead.
### Dies if a genome folder can't be entered, if it doesn't contain any .fa files, or if the indexer can't be
### launched.
sub launch_bowtie_indexer{
  if ($bowtie2){
    print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
  }
  else{
    print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
  }
  print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
  sleep(5);

  my $build_tool = $bowtie2 ? 'bowtie2-build' : 'bowtie-build';
  my $indexer;

  ### if the path to bowtie was specified explicitly we call the indexer inside of that folder
  if ($path_to_bowtie){
    $indexer = $path_to_bowtie;
    $indexer .= '/' unless ($indexer =~ /\/$/); # the folder and the program name need to be separated by a slash
    $indexer .= $build_tool;
  }
  ### otherwise we assume that bowtie-build is in the path
  else{
    $indexer = $build_tool;
  }

  $verbose and print "\n";

  ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
  my $pid = fork();

  # parent process
  if ($pid){
    sleep(1);
    my $file_list = _converted_fasta_list($CT_dir);
    $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
    $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
    $verbose and print "$indexer -f $file_list BS_CT\n\n";

    sleep (11);
    # exec only ever returns if the indexer could not be started
    exec ($indexer,'-f',$file_list,'BS_CT') or die "Failed to launch $indexer: $!\n";
  }

  # child process. fork returns 0 in the child, but undef if the fork failed. A plain '$pid == 0' is also
  # true for undef, so we need to test for definedness to be able to reach the sequential fallback below
  elsif (defined $pid){
    sleep(2);
    my $file_list = _converted_fasta_list($GA_dir);
    $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
    $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
    $verbose and print "$indexer -f $file_list BS_GA\n\n";
    $verbose and print "(starting in 10 seconds)\n";
    sleep(10);
    exec ($indexer,'-f',$file_list,'BS_GA') or die "Failed to launch $indexer: $!\n";
  }

  # if the platform doesn't support the fork command we will run the indexing processes one after the other
  else{
    print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
    sleep(10);

    ### moving to CT genome folder
    $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
    my $file_list = _converted_fasta_list($CT_dir);
    $verbose and print "$file_list\n\n";
    sleep(2);
    # there is no point in indexing the second genome if the first one could not be indexed
    system ($indexer,'-f',$file_list,'BS_CT') == 0
      or die "Indexing of the C->T converted genome with $indexer failed (exit status: $?)\n";

    ### moving to GA genome folder
    $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
    $file_list = _converted_fasta_list($GA_dir);
    $verbose and print "$file_list\n\n";
    sleep(2);
    exec ($indexer,'-f',$file_list,'BS_GA') or die "Failed to launch $indexer: $!\n";
  }
}

### Changes into the folder $dir and returns the names of all .fa files in there as a comma-separated list,
### which is the form the Bowtie indexers expect. Dies if the folder can't be entered or holds no .fa files.
sub _converted_fasta_list{
  my ($dir) = @_;
  chdir $dir or die "Unable to change directory: $!\n";
  my @fasta_files = <*.fa>;
  unless (@fasta_files){
    die "Found no bisulfite converted FastA files (*.fa) in $dir\n";
  }
  return join (',',@fasta_files);
}
173
174
### Step II: reads all FastA files of the genome and writes out a C->T converted and a G->A converted version.
###
### Takes no arguments. Reads the file-level variables @filenames, $multi_fasta, $verbose, $CT_dir and $GA_dir.
### In multi-FastA mode all sequences end up in genome_mfa.CT_conversion.fa and genome_mfa.GA_conversion.fa, in
### single-FastA mode every sequence gets its own pair of files named after the sequence. Sequences are
### upper-cased and every character other than A, T, C, G or N is replaced by N before the conversion.
### Prints the total number of conversions once it has finished. Dies if a file can't be opened or closed, if an
### input file is empty, or if an input file doesn't start with a FastA header.
sub process_sequence_files {

  my ($total_CT_conversions,$total_GA_conversions) = (0,0);
  $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
  sleep (3);

  $verbose and print "conversions performed:\n";
  $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";

  ### the filehandles the converted sequences are currently being written to
  my ($CT_fh,$GA_fh);

  ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
  ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
  ### This is now the default option

  if ($multi_fasta){
    ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
    ($CT_fh,$GA_fh) = _open_conversion_files('genome_mfa');
  }

  foreach my $filename (@filenames){
    my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
    open (my $in,'<',$filename) or die "Failed to read from sequence file $filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$in>;
    unless (defined $first_line){
      die "The sequence file $filename is empty\n";
    }
    chomp $first_line;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
    unless ($multi_fasta){
      _close_conversion_files($CT_fh,$GA_fh) if ($CT_fh); # finishing the files of the previous chromosome
      ($CT_fh,$GA_fh) = _open_conversion_files($chromosome_name);
    }

    print {$CT_fh} ">",$chromosome_name,"_CT_converted\n"; # first entry
    print {$GA_fh} ">",$chromosome_name,"_GA_converted\n"; # first entry

    while (my $line = <$in>){

      ### in case the line is a new fastA header
      if ($line =~ /^>/){
        ### printing out the stats for the previous chromosome
        $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
        ### resetting the chromosome transliteration counters
        ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);

        ### Extracting chromosome name from the additional FastA header
        $chromosome_name = extract_chromosome_name($line);

        unless ($multi_fasta){
          _close_conversion_files($CT_fh,$GA_fh);
          ($CT_fh,$GA_fh) = _open_conversion_files($chromosome_name);
        }

        print {$CT_fh} ">",$chromosome_name,"_CT_converted\n";
        print {$GA_fh} ">",$chromosome_name,"_GA_converted\n";
      }

      else{
        my $sequence = uc $line;

        ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
        $sequence =~ s/[^ATCGN\n\r]/N/g;

        ### The last line of a file might lack its line ending. Without one the FastA header of the next
        ### file would end up on the same line as this sequence in the multi-FastA output
        $sequence .= "\n" unless ($sequence =~ /\n$/);

        ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
        my $CT_sequence = $sequence;
        my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
        $total_CT_conversions += $CT_transliterations_performed;
        $chromosome_CT_conversions += $CT_transliterations_performed;

        print {$CT_fh} $CT_sequence;

        ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
        ### complementing the forward strand and then C->T converting it)
        my $GA_sequence = $sequence;
        my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
        $total_GA_conversions += $GA_transliterations_performed;
        $chromosome_GA_conversions += $GA_transliterations_performed;

        print {$GA_fh} $GA_sequence;
      }
    }
    close ($in) or die "Failed to close filehandle: $!\n";
    $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
  }
  _close_conversion_files($CT_fh,$GA_fh) if ($CT_fh);

  print "\nTotal number of conversions performed:\n";
  print "C->T:\t$total_CT_conversions\n";
  print "G->A:\t$total_GA_conversions\n";

  warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
}

### Opens <name>.CT_conversion.fa in the C->T genome folder and <name>.GA_conversion.fa in the G->A genome
### folder for writing and returns the two filehandles (C->T first). Dies if either file can't be opened.
sub _open_conversion_files{
  my ($name) = @_;

  my $bisulfite_CT_conversion_filename = "$CT_dir/$name.CT_conversion.fa";
  open (my $CT_out,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";

  my $bisulfite_GA_conversion_filename = "$GA_dir/$name.GA_conversion.fa";
  open (my $GA_out,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";

  return ($CT_out,$GA_out);
}

### Closes a pair of output filehandles. Write errors of buffered output only show up at this point, so we die
### if closing fails.
sub _close_conversion_files{
  my ($CT_out,$GA_out) = @_;
  close ($CT_out) or die "Failed to close filehandle: $!\n";
  close ($GA_out) or die "Failed to close filehandle: $!\n";
}
292
### Extracts the sequence name from a FastA header line.
###
### Argument: the header line, with or without its line ending.
### Returns:  the string between the initial > and the first whitespace.
### Dies if the line is undefined, doesn't start with a >, or if no name follows the > directly.
sub extract_chromosome_name {

  my $header = shift;

  ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well

  if (defined $header and $header =~ s/^>//){
    my ($chromosome_name) = split (/\s+/,$header);

    ## a header such as '>' or '> chr1' yields no name at all. Carrying on would write entries called
    ## '>_CT_converted', so we rather stop here
    unless (defined $chromosome_name and length $chromosome_name){
      $header =~ s/[\r\n]+$//;
      die "The FastA header '>$header' doesn't contain a sequence name directly after the '>'\n";
    }
    return $chromosome_name;
  }
  else{
    ## no system call has failed at this point, so the error variable $! would only add a misleading message
    die "The specified chromosome file doesn't seem to be in FASTA format as required!\n";
  }
}
307
### Step I: checks the genome folder and creates the folders for the bisulfite converted genomes.
###
### Takes no arguments. Reads the file-level variables $genome_folder, $path_to_bowtie, $bowtie2, $verbose and
### $bismark_version, and sets the following ones:
###   $genome_folder   absolute path of the genome folder, with a trailing slash
###   $path_to_bowtie  absolute path of the Bowtie folder, with a trailing slash (only if it was specified)
###   $CT_dir          <genome folder>/Bisulfite_Genome/CT_conversion/
###   $GA_dir          <genome folder>/Bisulfite_Genome/GA_conversion/
### If no genome folder was given the user is asked for one on STDIN. If the Bisulfite_Genome folder exists
### already the user is asked on STDIN whether to continue.
### Returns the names of the .fa files in the genome folder (or of the .fasta files if there are no .fa files).
### The working directory is the genome folder when the subroutine returns.
### Dies if a folder can't be entered or created, if there are no FastA files, if STDIN ends while we are
### waiting for an answer, or if the user chooses not to continue.
sub create_bisulfite_genome_folders{

  $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";

  # remembering where we were started from, relative paths from the command line refer to this folder
  my $start_dir = getcwd;

  # Ensuring a genome folder has been specified
  if ($genome_folder){
    unless ($genome_folder =~ /\/$/){
      $genome_folder =~ s/$/\//;
    }
    $verbose and print "Path to genome folder specified: $genome_folder\n";
    chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
  }

  else{
    $verbose and print "Genome folder was not provided as argument ";
    while (1){
      print "Please specify a genome folder to be bisulfite converted:\n";
      $genome_folder = <STDIN>;

      # if STDIN is closed or exhausted (e.g. in a non-interactive run) we would keep asking forever
      unless (defined $genome_folder){
        die "No genome folder was specified and there is no more input to read it from\n";
      }
      chomp $genome_folder;

      # an empty answer would turn into '/' below, so we ask again instead
      next unless (length $genome_folder);

      # adding a trailing slash unless already present
      unless ($genome_folder =~ /\/$/){
        $genome_folder =~ s/$/\//;
      }
      if (chdir $genome_folder){
        last;
      }
      else{
        warn "Could't move to directory $genome_folder! $!";
      }
    }
  }

  # making the genome folder path absolute so it won't break if the path was specified relative. This is
  # needed no matter how the folder was specified since we are going to change directories several times
  $genome_folder = getcwd;
  unless ($genome_folder =~ /\/$/){
    $genome_folder =~ s/$/\//;
  }

  if ($path_to_bowtie){
    # a relative path to bowtie refers to the folder we were started from and not to the genome folder
    if (defined $start_dir){
      chdir $start_dir or die "Could't move to directory $start_dir $!\n";
    }
    if (chdir $path_to_bowtie){
      # the indexer gets launched from within the converted genome folders, so this path needs to be absolute as well
      $path_to_bowtie = getcwd;
      unless ($path_to_bowtie =~ /\/$/){
        $path_to_bowtie =~ s/$/\//;
      }
      if ($bowtie2){
        $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
      }
      else{
        $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
      }
    }
    else{
      die "There was an error with the path to bowtie: $!\n";
    }
  }

  chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";


  # Exiting unless there are fastA files in the folder
  my @filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@filenames){
    @filenames = <*.fasta>;
  }

  unless (@filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
  }

  warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n";
  sleep (3);

  # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
  my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
  unless (-d $bisulfite_dir){
    mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
    $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
  }
  else{
    while (1){
      print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t";
      my $proceed = <STDIN>;

      # without an answer we don't overwrite anything (and we don't keep asking forever either)
      unless (defined $proceed){
        die "Terminated since there is no more input to read an answer from\n\n";
      }
      chomp $proceed;
      if ($proceed =~ /^y/i ){
        last;
      }
      elsif ($proceed =~ /^n/i){
        die "Terminated by user\n\n";
      }
    }
  }

  ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already

  chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
  $CT_dir = "${bisulfite_dir}CT_conversion/";
  $GA_dir = "${bisulfite_dir}GA_conversion/";

  # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
  # converted version of the genome
  unless (-d $CT_dir){
    mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
    $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
  }
  unless (-d $GA_dir){
    mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
    $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
  }

  # moving back to the original genome folder
  chdir $genome_folder or die "Could't move to directory $genome_folder $!";
  warn "\nStep I - Prepare genome folders - completed\n\n\n";
  return @filenames;
}
431
### Prints the help text (description, usage, options and arguments) to STDOUT.
### Takes no arguments. The here-document is single-quoted, so nothing in the text gets interpolated.
sub print_helpfile{
  print <<'HOW_TO';


DESCRIPTION

This script is supposed to convert a specified reference genome into two different bisulfite
converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
genome folder. Once the bisulfite conversion has been completed the program will fork and launch
two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
that the indexing process can take up to several hours; this will mainly depend on genome size
and system resources.




The following is a brief description of command line options and arguments to control the
Bismark Genome Preparation script:


USAGE: bismark_genome_preparation [options] <arguments>


OPTIONS:

--help/--man             Displays this help file and exits.

--version                Displays version information and exits.

--verbose                Print verbose output for more details or debugging.

--path_to_bowtie         The full path to the Bowtie 1 or Bowtie 2 installation on your system. If
                         the path </../../> is not provided as an option the indexer (bowtie-build
                         or bowtie2-build) is assumed to be in the PATH.

--bowtie2                This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).

--single_fasta           Instruct the Bismark Indexer to write the converted genomes into
                         single-entry FastA files (one per chromosome) instead of making one
                         multi-FastA file (MFA) per converted genome. This might be useful if
                         individual bisulfite converted chromosomes are needed (e.g. for
                         debugging), however it can cause a problem with indexing if the number
                         of chromosomes is vast (this is likely to be in the range of several
                         thousand files; the operating system can only handle lists up to a
                         certain length, and some newly assembled genomes may contain
                         20000-50000 contig or scaffold files which do exceed this list length
                         limit).


ARGUMENTS:

<path_to_genome_folder>  The path to the folder containing the genome to be bisulfite converted.
                         At the current time Bismark Genome Preparation expects one or more fastA
                         files in the folder (with the file extension: .fa or .fasta). If the path
                         is not provided as an argument you will be prompted for it.



This script was last modified on 18 Nov 2011.
HOW_TO
}