bismark: bismark_wrapper/bismark_genome

author	bjoern-gruening
date	Tue, 25 Dec 2012 05:52:28 -0500
parents
children

rev	line source
1 183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	1 #!/usr/bin/perl --
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	2 use strict;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	3 use warnings;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	4 use Cwd;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	5 use File::Path qw(rmtree);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	6 $\|++;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	7
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	8
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	9 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	10
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	11 ## This program is free software: you can redistribute it and/or modify
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	12 ## it under the terms of the GNU General Public License as published by
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	13 ## the Free Software Foundation, either version 3 of the License, or
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	14 ## (at your option) any later version.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	15
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	16 ## This program is distributed in the hope that it will be useful,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	19 ## GNU General Public License for more details.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	20
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	21 ## You should have received a copy of the GNU General Public License
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	23
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	24 use Getopt::Long;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	25 use Cwd;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	26
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	27 my $verbose;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	28 my $help;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	29 my $version;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	30 my $man;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	31 my $path_to_bowtie;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	32 my $multi_fasta;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	33 my $single_fasta;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	34 my $bowtie2;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	35
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	36 my $bismark_version = 'v0.7.7';
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	37
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	38 GetOptions ('verbose' => \$verbose,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	39 'help' => \$help,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	40 'man' => \$man,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	41 'version' => \$version,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	42 'path_to_bowtie:s' => \$path_to_bowtie,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	43 'single_fasta' => \$single_fasta,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	44 'bowtie2' => \$bowtie2,
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	45 );
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	46
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	47 my $genome_folder = shift @ARGV; # mandatory
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	48 my $CT_dir;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	49 my $GA_dir;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	50
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	51 if ($help or $man){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	52 print_helpfile();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	53 exit;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	54 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	55
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	56 if ($version){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	57 print << "VERSION";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	58
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	59 Bismark - Bisulfite Mapper and Methylation Caller.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	60
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	61 Bismark Genome Preparation Version: $bismark_version
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	62 Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	63 www.bioinformatics.babraham.ac.uk/projects/
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	64
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	65 VERSION
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	66 exit;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	67 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	68
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	69 if ($single_fasta){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	70 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	71 $multi_fasta = 0;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	72 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	73 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	74 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	75 $single_fasta = 0;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	76 $multi_fasta = 1;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	77 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	78
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	79 my @filenames = create_bisulfite_genome_folders();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	80
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	81 process_sequence_files ();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	82
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	83 launch_bowtie_indexer();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	84
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	85 sub launch_bowtie_indexer{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	86 if ($bowtie2){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	87 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	88 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	89 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	90 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	91 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	92 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	93 sleep(5);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	94
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	95 ### if the path to bowtie was specfified explicitely
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	96 if ($path_to_bowtie){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	97 if ($bowtie2){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	98 $path_to_bowtie =~ s/$/bowtie2-build/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	99 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	100 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	101 $path_to_bowtie =~ s/$/bowtie-build/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	102 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	103 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	104 ### otherwise we assume that bowtie-build is in the path
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	105 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	106 if ($bowtie2){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	107 $path_to_bowtie = 'bowtie2-build';
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	108 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	109 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	110 $path_to_bowtie = 'bowtie-build';
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	111 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	112 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	113
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	114 $verbose and print "\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	115
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	116 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	117 my $pid = fork();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	118
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	119 # parent process
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	120 if ($pid){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	121 sleep(1);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	122 chdir $CT_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	123 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	124 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	125 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	126 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	127 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	128
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	129 sleep (11);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	130 exec ("$path_to_bowtie","-f","$file_list","BS_CT");
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	131 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	132
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	133 # child process
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	134 elsif ($pid == 0){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	135 sleep(2);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	136 chdir $GA_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	137 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	138 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	139 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	140 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	141 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	142 $verbose and print "(starting in 10 seconds)\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	143 sleep(10);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	144 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	145 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	146
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	147 # if the platform doesn't support the fork command we will run the indexing processes one after the other
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	148 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	149 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	150 sleep(10);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	151
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	152 ### moving to CT genome folder
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	153 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	154 chdir $CT_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	155 my @fasta_files = <*.fa>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	156 my $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	157 $verbose and print "$file_list\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	158 sleep(2);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	159 system ("$path_to_bowtie","-f","$file_list","BS_CT");
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	160 @fasta_files=();
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	161 $file_list= '';
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	162
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	163 ### moving to GA genome folder
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	164 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	165 chdir $GA_dir or die "Unable to change directory: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	166 @fasta_files = <*.fa>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	167 $file_list = join (',',@fasta_files);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	168 $verbose and print "$file_list\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	169 sleep(2);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	170 exec ("$path_to_bowtie","-f","$file_list","BS_GA");
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	171 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	172 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	173
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	174
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	175 sub process_sequence_files {
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	176
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	177 my ($total_CT_conversions,$total_GA_conversions) = (0,0);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	178 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	179 sleep (3);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	180
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	181 $verbose and print "conversions performed:\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	182 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	183
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	184
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	185 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	186 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	187 ### This is now the default option
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	188
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	189 if ($multi_fasta){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	190 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	191 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	192 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	193
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	194 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	195 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	196 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	197
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	198 foreach my $filename(@filenames){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	199 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	200 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	201 # warn "Reading chromosome information from $filename\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	202
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	203 ### first line needs to be a fastA header
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	204 my $first_line = <IN>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	205 chomp $first_line;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	206
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	207 ### Extracting chromosome name from the FastA header
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	208 my $chromosome_name = extract_chromosome_name($first_line);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	209
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	210 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	211 unless ($multi_fasta){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	212 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	213 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	214 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	215
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	216 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	217 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	218 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	219 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	220
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	221 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	222 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	223
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	224
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	225 while (<IN>){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	226
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	227 ### in case the line is a new fastA header
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	228 if ($_ =~ /^>/){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	229 ### printing out the stats for the previous chromosome
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	230 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	231 ### resetting the chromosome transliteration counters
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	232 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	233
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	234 ### Extracting chromosome name from the additional FastA header
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	235 $chromosome_name = extract_chromosome_name($_);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	236
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	237 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	238 unless ($multi_fasta){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	239 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	240 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	242
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	243 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	244 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	245 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	246 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	247
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	248 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	249 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	250 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	251
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	252 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	253 my $sequence = uc$_;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	254
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	255 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	256
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	257 $sequence =~ s/[^ATCGN\n\r]/N/g;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	258
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	259 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	260
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	261 my $CT_sequence = $sequence;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	262 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	263 $total_CT_conversions += $CT_transliterations_performed;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	264 $chromosome_CT_conversions += $CT_transliterations_performed;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	265
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	266 print CT_CONVERT $CT_sequence;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	267
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	268 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	269 ### complementing the forward strand and then C->T converting it)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	270
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	271 my $GA_sequence = $sequence;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	272 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	273 $total_GA_conversions += $GA_transliterations_performed;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	274 $chromosome_GA_conversions += $GA_transliterations_performed;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	275
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	276 print GA_CONVERT $GA_sequence;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	277
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	278 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	279 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	280 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	281 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	282 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	283 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	284
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	285
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	286 print "\nTotal number of conversions performed:\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	287 print "C->T:\t$total_CT_conversions\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	288 print "G->A:\t$total_GA_conversions\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	289
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	290 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	291 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	292
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	293 sub extract_chromosome_name {
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	294
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	295 my $header = shift;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	296
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	297 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	298
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	299 if ($header =~ s/^>//){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	300 my ($chromosome_name) = split (/\s+/,$header);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	301 return $chromosome_name;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	302 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	303 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	304 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	305 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	306 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	307
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	308 sub create_bisulfite_genome_folders{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	309
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	310 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	311
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	312 # Ensuring a genome folder has been specified
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	313 if ($genome_folder){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	314 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	315 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	316 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	317 $verbose and print "Path to genome folder specified: $genome_folder\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	318 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	319
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	320 # making the genome folder path abolsolute so it won't break if the path was specified relative
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	321 $genome_folder = getcwd;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	322 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	323 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	324 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	325 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	326
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	327 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	328 $verbose and print "Genome folder was not provided as argument ";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	329 while (1){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	330 print "Please specify a genome folder to be bisulfite converted:\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	331 $genome_folder = <STDIN>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	332 chomp $genome_folder;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	333
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	334 # adding a trailing slash unless already present
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	335 unless ($genome_folder =~ /\/$/){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	336 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	337 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	338 if (chdir $genome_folder){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	339 last;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	340 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	341 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	342 warn "Could't move to directory $genome_folder! $!";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	343 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	344 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	345 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	346
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	347 if ($path_to_bowtie){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	348 unless ($path_to_bowtie =~ /\/$/){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	349 $path_to_bowtie =~ s/$/\//;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	350 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	351 if (chdir $path_to_bowtie){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	352 if ($bowtie2){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	353 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	354 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	355 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	356 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	357 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	358 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	359 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	360 die "There was an error with the path to bowtie: $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	361 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	362 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	363
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	364 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	365
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	366
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	367 # Exiting unless there are fastA files in the folder
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	368 my @filenames = <*.fa>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	369
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	370 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	371 unless (@filenames){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	372 @filenames = <*.fasta>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	373 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	374
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	375 unless (@filenames){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	376 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	377 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	378
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	379 warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	380 sleep (3);
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	381
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	382 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	383 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	384 unless (-d $bisulfite_dir){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	385 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	386 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	387 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	388 else{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	389 while (1){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	390 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	391 my $proceed = <STDIN>;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	392 chomp $proceed;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	393 if ($proceed =~ /^y/i ){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	394 last;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	395 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	396 elsif ($proceed =~ /^n/i){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	397 die "Terminated by user\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	398 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	399 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	400 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	401
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	402 ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	403 # removing any existing files and subfolders in the bisulfite directory (the specified directory won't be deleted)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	404 # rmtree($bisulfite_dir, {verbose => 1,keep_root => 1});
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	405 # unless (-d $bisulfite_dir){ # had to add this after changing remove_tree to rmtree // suggested by Samantha Cooper @ Illumina
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	406 # mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	407 # }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	408 # }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	409
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	410 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	411 $CT_dir = "${bisulfite_dir}CT_conversion/";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	412 $GA_dir = "${bisulfite_dir}GA_conversion/";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	413
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	414 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	415 # converted version of the genome
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	416 unless (-d $CT_dir){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	417 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	418 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	419 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	420 unless (-d $GA_dir){
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	421 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	422 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	423 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	424
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	425 # moving back to the original genome folder
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	426 chdir $genome_folder or die "Could't move to directory $genome_folder $!";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	427 # $verbose and print "Moved back to genome folder folder $genome_folder\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	428 warn "\nStep I - Prepare genome folders - completed\n\n\n";
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	429 return @filenames;
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	430 }
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	431
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	432 sub print_helpfile{
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	433 print << 'HOW_TO';
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	434
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	435
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	436 DESCRIPTION
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	437
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	438 This script is supposed to convert a specified reference genome into two different bisulfite
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	439 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	440 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	441 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	442 genome folder. Once the bisulfite conversion has been completed the program will fork and launch
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	443 two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	444 that the indexing process can take up to several hours; this will mainly depend on genome size
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	445 and system resources.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	446
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	447
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	448
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	449
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	450 The following is a brief description of command line options and arguments to control the
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	451 Bismark Genome Preparation script:
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	452
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	453
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	454 USAGE: bismark_genome_preparation [options] <arguments>
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	455
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	456
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	457 OPTIONS:
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	458
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	459 --help/--man Displays this help filea and exits.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	460
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	461 --version Displays version information and exits.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	462
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	463 --verbose Print verbose output for more details or debugging.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	464
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	465 --path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system.If
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	466 the path </../../> is not provided as an option you will be prompted for it.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	467
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	468 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	469
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	470 --single_fasta Instruct the Bismark Indexer to write the converted genomes into
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	471 single-entry FastA files instead of making one multi-FastA file (MFA)
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	472 per chromosome. This might be useful if individual bisulfite converted
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	473 chromosomes are needed (e.g. for debugging), however it can cause a
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	474 problem with indexing if the number of chromosomes is vast (this is likely
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	475 to be in the range of several thousand files; the operating system can
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	476 only handle lists up to a certain length, and some newly assembled
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	477 genomes may contain 20000-50000 contigs of scaffold files which do exceed
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	478 this list length limit).
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	479
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	480
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	481 ARGUMENTS:
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	482
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	483 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	484 At the current time Bismark Genome Preparation expects one or more fastA
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	485 files in the folder (with the file extension: .fa or .fasta). If the path
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	486 is not provided as an argument you will be prompted for it.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	487
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	488
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	489
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	490 This script was last modified on 18 Nov 2011.
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	491 HOW_TO
183de9d00131 add indices.loc files bjoern-gruening parents: diff changeset	492 }

1

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

1 #!/usr/bin/perl --

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

2 use strict;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

3 use warnings;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

4 use Cwd;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

5 use File::Path qw(rmtree);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

6 $|++;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

7

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

8

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

10

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

11 ## This program is free software: you can redistribute it and/or modify

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

12 ## it under the terms of the GNU General Public License as published by

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

13 ## the Free Software Foundation, either version 3 of the License, or

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

14 ## (at your option) any later version.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

15

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

16 ## This program is distributed in the hope that it will be useful,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

19 ## GNU General Public License for more details.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

20

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

21 ## You should have received a copy of the GNU General Public License

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

23

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

24 use Getopt::Long;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

25 use Cwd;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

26

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

27 my $verbose;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

28 my $help;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

29 my $version;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

30 my $man;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

31 my $path_to_bowtie;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

32 my $multi_fasta;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

33 my $single_fasta;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

34 my $bowtie2;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

35

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

36 my $bismark_version = 'v0.7.7';

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

37

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

38 GetOptions ('verbose' => \$verbose,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

39 'help' => \$help,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

40 'man' => \$man,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

41 'version' => \$version,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

42 'path_to_bowtie:s' => \$path_to_bowtie,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

43 'single_fasta' => \$single_fasta,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

44 'bowtie2' => \$bowtie2,

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

45 );

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

46

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

47 my $genome_folder = shift @ARGV; # mandatory

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

48 my $CT_dir;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

49 my $GA_dir;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

50

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

51 if ($help or $man){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

52 print_helpfile();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

53 exit;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

54 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

55

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

56 if ($version){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

57 print << "VERSION";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

58

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

59 Bismark - Bisulfite Mapper and Methylation Caller.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

60

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

61 Bismark Genome Preparation Version: $bismark_version

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

63 www.bioinformatics.babraham.ac.uk/projects/

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

64

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

65 VERSION

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

66 exit;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

67 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

68

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

69 if ($single_fasta){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

70 print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

71 $multi_fasta = 0;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

72 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

73 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

74 print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

75 $single_fasta = 0;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

76 $multi_fasta = 1;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

77 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

78

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

79 my @filenames = create_bisulfite_genome_folders();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

80

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

81 process_sequence_files ();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

82

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

83 launch_bowtie_indexer();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

84

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

85 sub launch_bowtie_indexer{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

86 if ($bowtie2){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

87 print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

88 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

89 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

90 print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

91 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

92 print "Please be aware that this process can - depending on genome size - take up to several hours!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

93 sleep(5);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

94

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

95 ### if the path to bowtie was specfified explicitely

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

96 if ($path_to_bowtie){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

97 if ($bowtie2){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

98 $path_to_bowtie =~ s/$/bowtie2-build/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

99 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

100 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

101 $path_to_bowtie =~ s/$/bowtie-build/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

102 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

103 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

104 ### otherwise we assume that bowtie-build is in the path

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

105 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

106 if ($bowtie2){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

107 $path_to_bowtie = 'bowtie2-build';

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

108 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

109 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

110 $path_to_bowtie = 'bowtie-build';

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

111 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

112 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

113

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

114 $verbose and print "\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

115

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

116 ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

117 my $pid = fork();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

118

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

119 # parent process

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

120 if ($pid){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

121 sleep(1);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

122 chdir $CT_dir or die "Unable to change directory: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

123 $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

124 my @fasta_files = <*.fa>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

125 my $file_list = join (',',@fasta_files);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

126 $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

127 $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

128

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

129 sleep (11);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

130 exec ("$path_to_bowtie","-f","$file_list","BS_CT");

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

131 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

132

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

133 # child process

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

134 elsif ($pid == 0){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

135 sleep(2);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

136 chdir $GA_dir or die "Unable to change directory: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

137 $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

138 my @fasta_files = <*.fa>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

139 my $file_list = join (',',@fasta_files);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

140 $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

141 $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

142 $verbose and print "(starting in 10 seconds)\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

143 sleep(10);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

144 exec ("$path_to_bowtie","-f","$file_list","BS_GA");

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

145 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

146

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

147 # if the platform doesn't support the fork command we will run the indexing processes one after the other

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

148 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

149 print "Forking process was not successful, therefore performing the indexing sequentially instead\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

150 sleep(10);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

151

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

152 ### moving to CT genome folder

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

153 $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

154 chdir $CT_dir or die "Unable to change directory: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

155 my @fasta_files = <*.fa>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

156 my $file_list = join (',',@fasta_files);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

157 $verbose and print "$file_list\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

158 sleep(2);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

159 system ("$path_to_bowtie","-f","$file_list","BS_CT");

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

160 @fasta_files=();

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

161 $file_list= '';

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

162

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

163 ### moving to GA genome folder

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

164 $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

165 chdir $GA_dir or die "Unable to change directory: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

166 @fasta_files = <*.fa>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

167 $file_list = join (',',@fasta_files);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

168 $verbose and print "$file_list\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

169 sleep(2);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

170 exec ("$path_to_bowtie","-f","$file_list","BS_GA");

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

171 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

172 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

173

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

174

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

175 sub process_sequence_files {

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

176

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

177 my ($total_CT_conversions,$total_GA_conversions) = (0,0);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

178 $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

179 sleep (3);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

180

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

181 $verbose and print "conversions performed:\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

182 $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

183

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

184

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

185 ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

186 ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

187 ### This is now the default option

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

188

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

189 if ($multi_fasta){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

190 ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

191 my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

192 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

193

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

194 my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

195 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

196 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

197

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

198 foreach my $filename(@filenames){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

199 my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

200 open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

201 # warn "Reading chromosome information from $filename\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

202

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

203 ### first line needs to be a fastA header

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

204 my $first_line = <IN>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

205 chomp $first_line;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

206

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

207 ### Extracting chromosome name from the FastA header

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

208 my $chromosome_name = extract_chromosome_name($first_line);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

209

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

210 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

211 unless ($multi_fasta){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

212 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

213 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

214 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

215

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

216 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

217 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

218 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

219 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

220

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

221 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

222 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

223

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

224

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

225 while (<IN>){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

226

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

227 ### in case the line is a new fastA header

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

228 if ($_ =~ /^>/){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

229 ### printing out the stats for the previous chromosome

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

230 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

231 ### resetting the chromosome transliteration counters

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

232 ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

233

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

234 ### Extracting chromosome name from the additional FastA header

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

235 $chromosome_name = extract_chromosome_name($_);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

236

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

237 ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

238 unless ($multi_fasta){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

239 my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

240 $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

241 open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

242

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

243 my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

244 $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

245 open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

246 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

247

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

248 print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

249 print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

250 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

251

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

252 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

253 my $sequence = uc$_;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

254

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

255 ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

256

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

257 $sequence =~ s/[^ATCGN\n\r]/N/g;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

258

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

259 ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

260

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

261 my $CT_sequence = $sequence;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

262 my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

263 $total_CT_conversions += $CT_transliterations_performed;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

264 $chromosome_CT_conversions += $CT_transliterations_performed;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

265

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

266 print CT_CONVERT $CT_sequence;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

267

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

268 ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

269 ### complementing the forward strand and then C->T converting it)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

270

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

271 my $GA_sequence = $sequence;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

272 my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

273 $total_GA_conversions += $GA_transliterations_performed;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

274 $chromosome_GA_conversions += $GA_transliterations_performed;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

275

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

276 print GA_CONVERT $GA_sequence;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

277

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

278 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

279 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

280 $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

281 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

282 close (CT_CONVERT) or die "Failed to close filehandle: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

283 close (GA_CONVERT) or die "Failed to close filehandle: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

284

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

285

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

286 print "\nTotal number of conversions performed:\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

287 print "C->T:\t$total_CT_conversions\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

288 print "G->A:\t$total_GA_conversions\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

289

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

290 warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

291 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

292

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

293 sub extract_chromosome_name {

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

294

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

295 my $header = shift;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

296

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

297 ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

298

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

299 if ($header =~ s/^>//){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

300 my ($chromosome_name) = split (/\s+/,$header);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

301 return $chromosome_name;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

302 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

303 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

304 die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

305 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

306 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

307

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

308 sub create_bisulfite_genome_folders{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

309

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

310 $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

311

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

312 # Ensuring a genome folder has been specified

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

313 if ($genome_folder){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

314 unless ($genome_folder =~ /\/$/){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

315 $genome_folder =~ s/$/\//;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

316 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

317 $verbose and print "Path to genome folder specified: $genome_folder\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

318 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

319

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

320 # making the genome folder path abolsolute so it won't break if the path was specified relative

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

321 $genome_folder = getcwd;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

322 unless ($genome_folder =~ /\/$/){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

323 $genome_folder =~ s/$/\//;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

324 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

325 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

326

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

327 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

328 $verbose and print "Genome folder was not provided as argument ";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

329 while (1){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

330 print "Please specify a genome folder to be bisulfite converted:\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

331 $genome_folder = <STDIN>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

332 chomp $genome_folder;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

333

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

334 # adding a trailing slash unless already present

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

335 unless ($genome_folder =~ /\/$/){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

336 $genome_folder =~ s/$/\//;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

337 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

338 if (chdir $genome_folder){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

339 last;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

340 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

341 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

342 warn "Could't move to directory $genome_folder! $!";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

343 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

344 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

345 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

346

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

347 if ($path_to_bowtie){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

348 unless ($path_to_bowtie =~ /\/$/){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

349 $path_to_bowtie =~ s/$/\//;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

350 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

351 if (chdir $path_to_bowtie){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

352 if ($bowtie2){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

353 $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

354 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

355 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

356 $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

357 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

358 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

359 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

360 die "There was an error with the path to bowtie: $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

361 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

362 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

363

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

364 chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

365

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

366

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

367 # Exiting unless there are fastA files in the folder

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

368 my @filenames = <*.fa>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

369

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

370 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

371 unless (@filenames){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

372 @filenames = <*.fasta>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

373 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

374

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

375 unless (@filenames){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

376 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

377 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

378

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

379 warn "Bisulfite Genome Indexer version $bismark_version (last modified 17 Nov 2011)\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

380 sleep (3);

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

381

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

382 # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

383 my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

384 unless (-d $bisulfite_dir){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

385 mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

386 $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

387 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

388 else{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

389 while (1){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

390 print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indexes might be overwritten!\nDo you want to continue anyway?\t";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

391 my $proceed = <STDIN>;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

392 chomp $proceed;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

393 if ($proceed =~ /^y/i ){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

394 last;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

395 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

396 elsif ($proceed =~ /^n/i){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

397 die "Terminated by user\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

398 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

399 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

400 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

401

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

402 ### as of version 0.6.0 the Bismark indexer will no longer delete the Bisulfite_Genome directory if it was present already, since it could store the Bowtie 1 or 2 indexes already

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

403 # removing any existing files and subfolders in the bisulfite directory (the specified directory won't be deleted)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

404 # rmtree($bisulfite_dir, {verbose => 1,keep_root => 1});

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

405 # unless (-d $bisulfite_dir){ # had to add this after changing remove_tree to rmtree // suggested by Samantha Cooper @ Illumina

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

406 # mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

407 # }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

408 # }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

409

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

410 chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

411 $CT_dir = "${bisulfite_dir}CT_conversion/";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

412 $GA_dir = "${bisulfite_dir}GA_conversion/";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

413

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

414 # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

415 # converted version of the genome

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

416 unless (-d $CT_dir){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

417 mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

418 $verbose and print "Created Bisulfite Genome folder $CT_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

419 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

420 unless (-d $GA_dir){

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

421 mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

422 $verbose and print "Created Bisulfite Genome folder $GA_dir\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

423 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

424

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

425 # moving back to the original genome folder

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

426 chdir $genome_folder or die "Could't move to directory $genome_folder $!";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

427 # $verbose and print "Moved back to genome folder folder $genome_folder\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

428 warn "\nStep I - Prepare genome folders - completed\n\n\n";

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

429 return @filenames;

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

430 }

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

431

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

432 sub print_helpfile{

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

433 print << 'HOW_TO';

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

434

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

435

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

436 DESCRIPTION

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

437

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

438 This script is supposed to convert a specified reference genome into two different bisulfite

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

439 converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

440 bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

441 converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

442 genome folder. Once the bisulfite conversion has been completed the program will fork and launch

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

443 two simultaneous instances of the bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

444 that the indexing process can take up to several hours; this will mainly depend on genome size

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

445 and system resources.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

446

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

447

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

448

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

449

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

450 The following is a brief description of command line options and arguments to control the

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

451 Bismark Genome Preparation script:

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

452

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

453

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

454 USAGE: bismark_genome_preparation [options] <arguments>

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

455

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

456

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

457 OPTIONS:

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

458

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

459 --help/--man Displays this help filea and exits.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

460

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

461 --version Displays version information and exits.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

462

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

463 --verbose Print verbose output for more details or debugging.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

464

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

465 --path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system.If

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

466 the path </../../> is not provided as an option you will be prompted for it.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

467

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

468 --bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

469

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

470 --single_fasta Instruct the Bismark Indexer to write the converted genomes into

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

471 single-entry FastA files instead of making one multi-FastA file (MFA)

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

472 per chromosome. This might be useful if individual bisulfite converted

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

473 chromosomes are needed (e.g. for debugging), however it can cause a

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

474 problem with indexing if the number of chromosomes is vast (this is likely

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

475 to be in the range of several thousand files; the operating system can

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

476 only handle lists up to a certain length, and some newly assembled

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

477 genomes may contain 20000-50000 contigs of scaffold files which do exceed

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

478 this list length limit).

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

479

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

480

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

481 ARGUMENTS:

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

482

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

483 <path_to_genome_folder> The path to the folder containing the genome to be bisulfite converted.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

484 At the current time Bismark Genome Preparation expects one or more fastA

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

485 files in the folder (with the file extension: .fa or .fasta). If the path

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

486 is not provided as an argument you will be prompted for it.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

487

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

488

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

489

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

490 This script was last modified on 18 Nov 2011.

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

491 HOW_TO

183de9d00131 add indices.loc files

bjoern-gruening

parents:

diff changeset

492 }

Mercurial > repos > bjoern-gruening > bismark

annotate bismark_wrapper/bismark_genome_preparation @ 1:183de9d00131 draft