Mercurial > repos > bjoern-gruening > bismark
comparison bismark_genome_preparation @ 0:36d124f44c0a draft
inital commit
author | bjoern-gruening |
---|---|
date | Tue, 25 Dec 2012 05:45:46 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36d124f44c0a |
---|---|
1 #!/usr/bin/perl -- | |
2 use strict; | |
3 use warnings; | |
4 use Cwd; | |
5 use File::Path qw(rmtree); | |
6 $|++; | |
7 | |
8 | |
9 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk) | |
10 | |
11 ## This program is free software: you can redistribute it and/or modify | |
12 ## it under the terms of the GNU General Public License as published by | |
13 ## the Free Software Foundation, either version 3 of the License, or | |
14 ## (at your option) any later version. | |
15 | |
16 ## This program is distributed in the hope that it will be useful, | |
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 ## GNU General Public License for more details. | |
20 | |
21 ## You should have received a copy of the GNU General Public License | |
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
23 | |
24 use Getopt::Long; | |
25 use Cwd; | |
26 | |
27 my $verbose; | |
28 my $help; | |
29 my $version; | |
30 my $man; | |
31 my $path_to_bowtie; | |
32 my $multi_fasta; | |
33 my $single_fasta; | |
34 my $bowtie2; | |
35 | |
36 my $bismark_version = 'v0.7.7'; | |
37 | |
38 GetOptions ('verbose' => \$verbose, | |
39 'help' => \$help, | |
40 'man' => \$man, | |
41 'version' => \$version, | |
42 'path_to_bowtie:s' => \$path_to_bowtie, | |
43 'single_fasta' => \$single_fasta, | |
44 'bowtie2' => \$bowtie2, | |
45 ); | |
46 | |
47 my $genome_folder = shift @ARGV; # mandatory | |
48 my $CT_dir; | |
49 my $GA_dir; | |
50 | |
51 if ($help or $man){ | |
52 print_helpfile(); | |
53 exit; | |
54 } | |
55 | |
56 if ($version){ | |
57 print << "VERSION"; | |
58 | |
59 Bismark - Bisulfite Mapper and Methylation Caller. | |
60 | |
61 Bismark Genome Preparation Version: $bismark_version | |
62 Copyright 2010-12 Felix Krueger, Babraham Bioinformatics | |
63 www.bioinformatics.babraham.ac.uk/projects/ | |
64 | |
65 VERSION | |
66 exit; | |
67 } | |
68 | |
69 if ($single_fasta){ | |
70 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; | |
71 $multi_fasta = 0; | |
72 } | |
73 else{ | |
74 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; | |
75 $single_fasta = 0; | |
76 $multi_fasta = 1; | |
77 } | |
78 | |
79 my @filenames = create_bisulfite_genome_folders(); | |
80 | |
81 process_sequence_files (); | |
82 | |
83 launch_bowtie_indexer(); | |
84 | |
85 sub launch_bowtie_indexer{ | |
86 if ($bowtie2){ | |
87 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; | |
88 } | |
89 else{ | |
90 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; | |
91 } | |
92 print "Please be aware that this process can - depending on genome size - take up to several hours!\n"; | |
93 sleep(5); | |
94 | |
95 ### if the path to bowtie was specfified explicitely | |
96 if ($path_to_bowtie){ | |
97 if ($bowtie2){ | |
98 $path_to_bowtie =~ s/$/bowtie2-build/; | |
99 } | |
100 else{ | |
101 $path_to_bowtie =~ s/$/bowtie-build/; | |
102 } | |
103 } | |
104 ### otherwise we assume that bowtie-build is in the path | |
105 else{ | |
106 if ($bowtie2){ | |
107 $path_to_bowtie = 'bowtie2-build'; | |
108 } | |
109 else{ | |
110 $path_to_bowtie = 'bowtie-build'; | |
111 } | |
112 } | |
113 | |
114 $verbose and print "\n"; | |
115 | |
116 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) | |
117 my $pid = fork(); | |
118 | |
119 # parent process | |
120 if ($pid){ | |
121 sleep(1); | |
122 chdir $CT_dir or die "Unable to change directory: $!\n"; | |
123 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; | |
124 my @fasta_files = <*.fa>; | |
125 my $file_list = join (',',@fasta_files); | |
126 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; | |
127 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; | |
128 | |
129 sleep (11); | |
130 exec ("$path_to_bowtie","-f","$file_list","BS_CT"); | |
131 } | |
132 | |
133 # child process | |
134 elsif ($pid == 0){ | |
135 sleep(2); | |
136 chdir $GA_dir or die "Unable to change directory: $!\n"; | |
137 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; | |
138 my @fasta_files = <*.fa>; | |
139 my $file_list = join (',',@fasta_files); | |
140 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; | |
141 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; | |
142 $verbose and print "(starting in 10 seconds)\n"; | |
143 sleep(10); | |
144 exec ("$path_to_bowtie","-f","$file_list","BS_GA"); | |
145 } | |
146 | |
147 # if the platform doesn't support the fork command we will run the indexing processes one after the other | |
148 else{ | |
149 print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; | |
150 sleep(10); | |
151 | |
152 ### moving to CT genome folder | |
153 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; | |
154 chdir $CT_dir or die "Unable to change directory: $!\n"; | |
155 my @fasta_files = <*.fa>; | |
156 my $file_list = join (',',@fasta_files); | |
157 $verbose and print "$file_list\n\n"; | |
158 sleep(2); | |
159 system ("$path_to_bowtie","-f","$file_list","BS_CT"); | |
160 @fasta_files=(); | |
161 $file_list= ''; | |
162 | |
163 ### moving to GA genome folder | |
164 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; | |
165 chdir $GA_dir or die "Unable to change directory: $!\n"; | |
166 @fasta_files = <*.fa>; | |
167 $file_list = join (',',@fasta_files); | |
168 $verbose and print "$file_list\n\n"; | |
169 sleep(2); | |
170 exec ("$path_to_bowtie","-f","$file_list","BS_GA"); | |
171 } | |
172 } | |
173 | |
174 | |
175 sub process_sequence_files { | |
176 | |
177 my ($total_CT_conversions,$total_GA_conversions) = (0,0); | |
178 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; | |
179 sleep (3); | |
180 | |
181 $verbose and print "conversions performed:\n"; | |
182 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; | |
183 | |
184 | |
185 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file | |
186 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle | |
187 ### This is now the default option | |
188 | |
189 if ($multi_fasta){ | |
190 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files | |
191 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; | |
192 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
193 | |
194 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; | |
195 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
196 } | |
197 | |
198 foreach my $filename(@filenames){ | |
199 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); | |
200 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; | |
201 # warn "Reading chromosome information from $filename\n\n"; | |
202 | |
203 ### first line needs to be a fastA header | |
204 my $first_line = <IN>; | |
205 chomp $first_line; | |
206 | |
207 ### Extracting chromosome name from the FastA header | |
208 my $chromosome_name = extract_chromosome_name($first_line); | |
209 | |
210 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. | |
211 unless ($multi_fasta){ | |
212 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; | |
213 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; | |
214 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
215 | |
216 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; | |
217 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; | |
218 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
219 } | |
220 | |
221 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry | |
222 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry | |
223 | |
224 | |
225 while (<IN>){ | |
226 | |
227 ### in case the line is a new fastA header | |
228 if ($_ =~ /^>/){ | |
229 ### printing out the stats for the previous chromosome | |
230 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; | |
231 ### resetting the chromosome transliteration counters | |
232 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); | |
233 | |
234 ### Extracting chromosome name from the additional FastA header | |
235 $chromosome_name = extract_chromosome_name($_); | |
236 | |
237 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. | |
238 unless ($multi_fasta){ | |
239 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; | |
240 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; | |
241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; | |
242 | |
243 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; | |
244 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; | |
245 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; | |
246 } | |
247 | |
248 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; | |
249 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; | |
250 } | |
251 | |
252 else{ | |
253 my $sequence = uc$_; | |
254 | |
255 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) | |
256 | |
257 $sequence =~ s/[^ATCGN\n\r]/N/g; | |
258 | |
259 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) | |
260 | |
261 my $CT_sequence = $sequence; | |
262 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts | |
263 $total_CT_conversions += $CT_transliterations_performed; | |
264 $chromosome_CT_conversions += $CT_transliterations_performed; | |
265 | |
266 print CT_CONVERT $CT_sequence; | |
267 | |
268 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- | |
269 ### complementing the forward strand and then C->T converting it) | |
270 | |
271 my $GA_sequence = $sequence; | |
272 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand | |
273 $total_GA_conversions += $GA_transliterations_performed; | |
274 $chromosome_GA_conversions += $GA_transliterations_performed; | |
275 | |
276 print GA_CONVERT $GA_sequence; | |
277 | |
278 } | |
279 } | |
280 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; | |
281 } | |
282 close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; | |
283 close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; | |
284 | |
285 | |
286 print "\nTotal number of conversions performed:\n"; | |
287 print "C->T:\t$total_CT_conversions\n"; | |
288 print "G->A:\t$total_GA_conversions\n"; | |
289 | |
290 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; | |
291 } | |
292 | |
293 sub extract_chromosome_name { | |
294 | |
295 my $header = shift; | |
296 | |
297 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well | |
298 | |
299 if ($header =~ s/^>//){ | |
300 my ($chromosome_name) = split (/\s+/,$header); | |
301 return $chromosome_name; | |
302 } | |
303 else{ | |
304 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n"; | |
305 } | |
306 } | |
307 | |
308 sub create_bisulfite_genome_folders{ | |
309 | |
310 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; | |
311 | |
312 # Ensuring a genome folder has been specified | |
313 if ($genome_folder){ | |
314 unless ($genome_folder =~ /\/$/){ | |
315 $genome_folder =~ s/$/\//; | |
316 } | |
317 $verbose and print "Path to genome folder specified: $genome_folder\n"; | |
318 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; | |
319 | |
320 # making the genome folder path abolsolute so it won't break if the path was specified relative | |
321 $genome_folder = getcwd; | |
322 unless ($genome_folder =~ /\/$/){ | |
323 $genome_folder =~ s/$/\//; | |
324 } | |
325 } | |
326 | |
327 else{ | |
328 $verbose and print "Genome folder was not provided as argument "; | |
329 while (1){ | |
330 print "Please specify a genome folder to be bisulfite converted:\n"; | |
331 $genome_folder = <STDIN>; | |
332 chomp $genome_folder; | |
333 | |
334 # adding a trailing slash unless already present | |
335 unless ($genome_folder =~ /\/$/){ | |
336 $genome_folder =~ s/$/\//; | |
337 } | |
338 if (chdir $genome_folder){ | |
339 last; | |
340 } | |
341 else{ | |
342 warn "Could't move to directory $genome_folder! $!"; | |
343 } | |
344 } | |
345 } | |
346 | |
347 if ($path_to_bowtie){ | |
348 unless ($path_to_bowtie =~ /\/$/){ | |
349 $path_to_bowtie =~ s/$/\//; | |
350 } | |
351 if (chdir $path_to_bowtie){ | |
352 if ($bowtie2){ | |
353 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; | |
354 } | |
355 else{ | |
356 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; | |
357 } | |
358 } | |
359 else{ | |
360 die "There was an error with the path to bowtie: $!\n"; | |
361 } | |
362 } | |
363 | |
364 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; | |
365 | |
366 | |
367 # Exiting unless there are fastA files in the folder | |
368 my @filenames = <*.fa>; | |
369 | |
370 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
371 unless (@filenames){ | |
372 @filenames = <*.fasta>; | |
373 } | |
374 | |
375 unless (@filenames){ | |
376 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; | |
377 } | |
378 | |
379 warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n"; | |
380 sleep (3); | |
381 | |
382 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists | |
383 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; | |
384 unless (-d $bisulfite_dir){ | |
385 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; | |
386 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; | |
387 } | |
388 else{ | |
389 while (1){ | |
390 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t"; | |
391 my $proceed = <STDIN>; | |
392 chomp $proceed; | |
393 if ($proceed =~ /^y/i ){ | |
394 last; | |
395 } | |
396 elsif ($proceed =~ /^n/i){ | |
397 die "Terminated by user\n\n"; | |
398 } | |
399 } | |
400 } | |
401 | |
402 ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already | |
403 # removing any existing files and subfolders in the bisulfite directory (the specified directory won't be deleted) | |
404 # rmtree($bisulfite_dir, {verbose => 1,keep_root => 1}); | |
405 # unless (-d $bisulfite_dir){ # had to add this after changing remove_tree to rmtree // suggested by Samantha Cooper @ Illumina | |
406 # mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; | |
407 # } | |
408 # } | |
409 | |
410 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; | |
411 $CT_dir = "${bisulfite_dir}CT_conversion/"; | |
412 $GA_dir = "${bisulfite_dir}GA_conversion/"; | |
413 | |
414 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) | |
415 # converted version of the genome | |
416 unless (-d $CT_dir){ | |
417 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; | |
418 $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; | |
419 } | |
420 unless (-d $GA_dir){ | |
421 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; | |
422 $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; | |
423 } | |
424 | |
425 # moving back to the original genome folder | |
426 chdir $genome_folder or die "Could't move to directory $genome_folder $!"; | |
427 # $verbose and print "Moved back to genome folder folder $genome_folder\n"; | |
428 warn "\nStep I - Prepare genome folders - completed\n\n\n"; | |
429 return @filenames; | |
430 } | |
431 | |
432 sub print_helpfile{ | |
433 print << 'HOW_TO'; | |
434 | |
435 | |
436 DESCRIPTION | |
437 | |
438 This script is supposed to convert a specified reference genome into two different bisulfite | |
439 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first | |
440 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs | |
441 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference | |
442 genome folder. Once the bisulfite conversion has been completed the program will fork and launch | |
443 two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware | |
444 that the indexing process can take up to several hours; this will mainly depend on genome size | |
445 and system resources. | |
446 | |
447 | |
448 | |
449 | |
450 The following is a brief description of command line options and arguments to control the | |
451 Bismark Genome Preparation script: | |
452 | |
453 | |
454 USAGE: bismark_genome_preparation [options] <arguments> | |
455 | |
456 | |
457 OPTIONS: | |
458 | |
459 --help/--man Displays this help filea and exits. | |
460 | |
461 --version Displays version information and exits. | |
462 | |
463 --verbose Print verbose output for more details or debugging. | |
464 | |
465 --path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system.If | |
466 the path </../../> is not provided as an option you will be prompted for it. | |
467 | |
468 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1). | |
469 | |
470 --single_fasta Instruct the Bismark Indexer to write the converted genomes into | |
471 single-entry FastA files instead of making one multi-FastA file (MFA) | |
472 per chromosome. This might be useful if individual bisulfite converted | |
473 chromosomes are needed (e.g. for debugging), however it can cause a | |
474 problem with indexing if the number of chromosomes is vast (this is likely | |
475 to be in the range of several thousand files; the operating system can | |
476 only handle lists up to a certain length, and some newly assembled | |
477 genomes may contain 20000-50000 contigs of scaffold files which do exceed | |
478 this list length limit). | |
479 | |
480 | |
481 ARGUMENTS: | |
482 | |
483 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted. | |
484 At the current time Bismark Genome Preparation expects one or more fastA | |
485 files in the folder (with the file extension: .fa or .fasta). If the path | |
486 is not provided as an argument you will be prompted for it. | |
487 | |
488 | |
489 | |
490 This script was last modified on 18 Nov 2011. | |
491 HOW_TO | |
492 } |