annotate methylation_analysis_bismark/methylation_analysis/bismark @ 10:2432df265dad draft

Uploaded
author fcaramia
date Wed, 12 Dec 2012 19:45:04 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1 #!/usr/bin/perl --
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2 use strict;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3 use warnings;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4 use IO::Handle;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5 use Cwd;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6 $|++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
7 use Getopt::Long;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
8
2432df265dad Uploaded
fcaramia
parents:
diff changeset
9
2432df265dad Uploaded
fcaramia
parents:
diff changeset
10 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@bbsrc.ac.uk)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
11
2432df265dad Uploaded
fcaramia
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
2432df265dad Uploaded
fcaramia
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
2432df265dad Uploaded
fcaramia
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
15 ## (at your option) any later version.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
16
2432df265dad Uploaded
fcaramia
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
2432df265dad Uploaded
fcaramia
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
20 ## GNU General Public License for more details.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
21
2432df265dad Uploaded
fcaramia
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
2432df265dad Uploaded
fcaramia
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
24
2432df265dad Uploaded
fcaramia
parents:
diff changeset
25
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### The directory Bismark was launched from; the main loop returns here before each input file
my $parent_dir      = getcwd;
my $bismark_version = 'v0.7.6';
my $command_line    = join (" ",@ARGV);   # the arguments exactly as given, captured before the rewrite below

### The '.' in '--solexa1.3-quals' causes Getopt::Long to fail, so this option is
### replaced with '--phred64-quals' before the command line gets processed
@ARGV = map { $_ eq '--solexa1.3-quals' ? '--phred64-quals' : $_ } @ARGV;

my @filenames;   # will be populated by processing the command line

### All run-wide settings come back from process_command_line() in this fixed order
my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
) = process_command_line();

my @fhs;          # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes;  # stores the chromosome sequences of the genome
my %counting;     # counting various events

my $seqID_contains_tabs;   # set back to 0 at the start of every input file in the main loop
2432df265dad Uploaded
fcaramia
parents:
diff changeset
45
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Main loop: one pass per entry of @filenames. An entry that contains a comma is treated
### as a 'mate1,mate2' pair (paired-end), anything else as a single-end file.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### anything that is not FastA is handled as FastQ
  my $is_fasta    = ($sequence_file_format eq 'FASTA');
  my $format_name = $is_fasta ? 'FastA' : 'FastQ';

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file (paired-end only)

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    ### only the first two comma-separated names are used
    my ($filename_1,$filename_2) = (split (/,/,$filename));
    print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    print "Input files are in $format_name format\n";

    ### the FastA and FastQ code paths differ only in which routines get called
    my $bisulfite_transform = $is_fasta
      ? \&biTransformFastAFiles_paired_end
      : \&biTransformFastQFiles_paired_end;

    if ($directional){
      ### directional: only one transformed file per mate is taken from the return list
      ($C_to_T_infile_1) = $bisulfite_transform->($filename_1,1); # also passing the read number
      ($G_to_A_infile_2) = $bisulfite_transform->($filename_2,2);
    }
    else{
      ($C_to_T_infile_1,$G_to_A_infile_1) = $bisulfite_transform->($filename_1,1); # also passing the read number
      ($C_to_T_infile_2,$G_to_A_infile_2) = $bisulfite_transform->($filename_2,2);
    }

    ### In directional mode $G_to_A_infile_1 and $C_to_T_infile_2 were never assigned, so
    ### entries 1 and 2 end up with undef input files, exactly as if undef had been assigned explicitly
    $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
    $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[3]->{inputfile_2} = $G_to_A_infile_2;

    ### picking the alignment routine for this input format and Bowtie version
    my $align = $is_fasta
      ? ( $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                   : \&paired_end_align_fragments_to_bisulfite_genome_fastA )
      : ( $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                   : \&paired_end_align_fragments_to_bisulfite_genome_fastQ );
    $align->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    print "Input file is in $format_name format\n";

    my $bisulfite_transform = $is_fasta
      ? \&biTransformFastAFiles
      : \&biTransformFastQFiles;

    if ($directional){
      ### directional: only the C->T file is used, entries 2 and 3 get no input file
      ($C_to_T_infile) = $bisulfite_transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = $bisulfite_transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
    }

    ### Starting the alignments (per the original comment this creates the bowtie filehandles
    ### and stores the first entry of each)
    my $align = $is_fasta
      ? ( $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                   : \&single_end_align_fragments_to_bisulfite_genome_fastA )
      : ( $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                   : \&single_end_align_fragments_to_bisulfite_genome_fastQ );
    $align->($C_to_T_infile,$G_to_A_infile);

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
204
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Sets up all output for one single-end input file and then starts the methylation calling.
###
### Arguments:
###   $sequence_file - the read file as supplied by the user (may include a directory part)
###   $C_to_T_infile - name of the C->T transformed read file
###   $G_to_A_infile - name of the G->A transformed read file (the main loop leaves this
###                    undefined in directional mode)
###
### Opens the package filehandles OUT and REPORT, plus UNMAPPED and AMBIG when the
### corresponding options are set. They are deliberately left open here; presumably the
### downstream processing routines write to and close them (not visible in this sub).
### All output files are created as "$output_dir" followed by a name derived from the input
### file name. Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### output names are based on the input file name without its directory part
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = $filename.'_bt2_bismark.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = $filename.'_bismark.txt';
  }
  else{ # SAM is the default output
    $outfile = $filename.'_bismark.sam';
  }

  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  ### error messages report the full path that was actually opened, not just the file name
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $bowtie2
    ? $filename.'_bt2_Bismark_mapping_report.txt'
    : $filename.'_Bismark_mapping_report.txt';

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  ### optional output of reads that did not align
  if ($unmapped){
    my $unmapped_file = $filename.'_unmapped_reads.txt';
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $output_dir$unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }

  ### optional output of reads that aligned ambiguously
  if ($ambiguous){
    my $ambiguous_file = $filename.'_ambiguous_reads.txt';
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $output_dir$ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### the SAM header is skipped for the vanilla output format and when it was switched off explicitly
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
285
2432df265dad Uploaded
fcaramia
parents:
diff changeset
286 sub start_methylation_call_procedure_paired_ends {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
287 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
288
2432df265dad Uploaded
fcaramia
parents:
diff changeset
289 my ($dir_1,$filename_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
290
2432df265dad Uploaded
fcaramia
parents:
diff changeset
291 if ($sequence_file_1 =~ /\//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
292 ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
293 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
294 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
295 $filename_1 = $sequence_file_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
296 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
297
2432df265dad Uploaded
fcaramia
parents:
diff changeset
298 my ($dir_2,$filename_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
299
2432df265dad Uploaded
fcaramia
parents:
diff changeset
300 if ($sequence_file_2 =~ /\//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
301 ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
302 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
303 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
304 $filename_2 = $sequence_file_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
305 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
306
2432df265dad Uploaded
fcaramia
parents:
diff changeset
307 ### printing all alignments to a results file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
308 my $outfile = $filename_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
309 if ($bowtie2){ # SAM format is the default Bowtie 2 output
2432df265dad Uploaded
fcaramia
parents:
diff changeset
310 $outfile =~ s/$/_bismark_bt2_pe.sam/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
311 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
312 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
313 $outfile =~ s/$/_bismark_pe.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
314 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
315 else{ # SAM format is the default Bowtie 1 output
2432df265dad Uploaded
fcaramia
parents:
diff changeset
316 $outfile =~ s/$/_bismark_pe.sam/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
317 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
318
2432df265dad Uploaded
fcaramia
parents:
diff changeset
319 print "Writing bisulfite mapping results to $outfile\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
320 open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
321 if ($vanilla){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
322 print OUT "Bismark version: $bismark_version\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
323 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
324
2432df265dad Uploaded
fcaramia
parents:
diff changeset
325 ### printing alignment and methylation call summary to a report file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
326 my $reportfile = $filename_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
327 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
328 $reportfile =~ s/$/_Bismark_bt2_paired-end_mapping_report.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
329 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
330 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
331 $reportfile =~ s/$/_Bismark_paired-end_mapping_report.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
332 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
333
2432df265dad Uploaded
fcaramia
parents:
diff changeset
334 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
335 print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
336 print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
337
2432df265dad Uploaded
fcaramia
parents:
diff changeset
338
2432df265dad Uploaded
fcaramia
parents:
diff changeset
339 ### Unmapped read output
2432df265dad Uploaded
fcaramia
parents:
diff changeset
340 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
341 my $unmapped_1 = $filename_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
342 my $unmapped_2 = $filename_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
343 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
344 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
345 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
346 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
347 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
348 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
349
2432df265dad Uploaded
fcaramia
parents:
diff changeset
350 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
351 my $amb_1 = $filename_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
352 my $amb_2 = $filename_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
353 $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
354 $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
355 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
356 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
357 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
358 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
359
2432df265dad Uploaded
fcaramia
parents:
diff changeset
360 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
361 print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
362 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
363
2432df265dad Uploaded
fcaramia
parents:
diff changeset
364 ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
2432df265dad Uploaded
fcaramia
parents:
diff changeset
365 unless (%chromosomes){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
366 my $cwd = getcwd; # storing the path of the current working directory
2432df265dad Uploaded
fcaramia
parents:
diff changeset
367 print "Current working directory is: $cwd\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
368 read_genome_into_memory($cwd);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
369 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
370
2432df265dad Uploaded
fcaramia
parents:
diff changeset
371 unless ($vanilla or $sam_no_hd){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
372 generate_SAM_header();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
373 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
374
2432df265dad Uploaded
fcaramia
parents:
diff changeset
375 ### Input files are in FastA format
2432df265dad Uploaded
fcaramia
parents:
diff changeset
376 if ($sequence_file_format eq 'FASTA'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
377 process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
378 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
379 ### Input files are in FastQ format
2432df265dad Uploaded
fcaramia
parents:
diff changeset
380 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
381 process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
382 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
383 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
384
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Prints the final alignment and cytosine methylation summary of a single-end run.
###
### Arguments: the names of the temporary C->T and G->A converted input files (without the
### temporary directory prefix). They are deleted here because all sequences have been
### analysed by the time this sub is called. If $directional is set only the C->T file is
### removed.
###
### Uses %counting, $directional, $temp_dir and $seqID_contains_tabs, which are declared
### outside of this sub, and writes to the REPORT filehandle (which must already be open)
### as well as to STDOUT and STDERR.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $C_to_T_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  print REPORT "Final Alignment report\n",'='x22,"\n";
  print "Final Alignment report\n",'='x22,"\n";
  #   foreach my $index (0..$#fhs){
  #     print "$fhs[$index]->{name}\n";
  #     print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #     print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #   }

  ### printing a final report for the methylation call procedure
  my $sequences_analysed = "Sequences analysed in total:\t$counting{sequences_count}\n";
  warn $sequences_analysed;
  print REPORT $sequences_analysed;

  ### An empty input file leaves sequences_count at 0; report a mapping efficiency of 0.0
  ### in this case instead of dying with an 'Illegal division by zero' error
  my $percent_alignable_sequences = sprintf ("%.1f", $counting{sequences_count} ? $counting{unique_best_alignment_count}*100/$counting{sequences_count} : 0);

  my $mapping_efficiency = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn $mapping_efficiency,"\n"; # the screen output is followed by an additional blank line
  print REPORT $mapping_efficiency;

  ### The percentage of low complexity alignments which were overruled is no longer reported; the
  ### report line used to be:
  # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";

  ### alignment details are identical for STDOUT and the report file
  my $alignment_details = join ('',
    "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
    join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
    "\n\n",
  );

  if ($directional){
    $alignment_details .= "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }

  print $alignment_details;
  print REPORT $alignment_details;

  ### detailed information about Cs analysed
  ### each entry holds: context name, methylated count, unmethylated (C to T converted) count
  my @contexts = (
    ['CpG',$counting{total_meCpG_count},$counting{total_unmethylated_CpG_count}],
    ['CHG',$counting{total_meCHG_count},$counting{total_unmethylated_CHG_count}],
    ['CHH',$counting{total_meCHH_count},$counting{total_unmethylated_CHH_count}],
  );

  my $total_number_of_C = 0;
  foreach my $context (@contexts){
    my ($name,$methylated,$unmethylated) = @{$context};
    $total_number_of_C += $methylated + $unmethylated;
  }

  ### STDERR and the report file receive exactly the same text (the report file used to
  ### contain a stray space in front of the methylated CpG count)
  my $cytosine_report = "Final Cytosine Methylation Report\n" . '='x33 . "\n";
  $cytosine_report .= "Total number of C's analysed:\t$total_number_of_C\n\n";
  foreach my $context (@contexts){
    my ($name,$methylated,$unmethylated) = @{$context};
    $cytosine_report .= "Total methylated C's in $name context:\t$methylated\n";
  }
  $cytosine_report .= "\n";
  foreach my $context (@contexts){
    my ($name,$methylated,$unmethylated) = @{$context};
    $cytosine_report .= "Total C to T conversions in $name context:\t$unmethylated\n";
  }
  $cytosine_report .= "\n";

  warn $cytosine_report;
  print REPORT $cytosine_report;

  ### printing the percentage of methylated Cs for each context if applicable, i.e. if at
  ### least one C was seen in this context
  foreach my $context (@contexts){
    my ($name,$methylated,$unmethylated) = @{$context};
    my $line_end = ($name eq 'CHH') ? "\n\n\n" : "\n"; # CHH is the last context and closes the section

    my $message;
    if (($methylated+$unmethylated) > 0){
      my $percent_methylated = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
      $message = "C methylated in $name context:\t$percent_methylated" . '%' . $line_end;
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $name context if value was 0" . $line_end;
    }
    warn $message;
    print REPORT $message;
  }

  if ($seqID_contains_tabs){
    my $tab_warning = "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
    warn $tab_warning;
    print REPORT $tab_warning;
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
522
2432df265dad Uploaded
fcaramia
parents:
diff changeset
523 sub print_final_analysis_report_paired_ends{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
524 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
525 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
2432df265dad Uploaded
fcaramia
parents:
diff changeset
526 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
527 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
528 if ($deletion_successful == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
529 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
530 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
531 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
532 warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
533 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
534 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
535 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
536 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
537 if ($deletion_successful == 4){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
538 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
539 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
540 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
541 warn "Could not delete temporary files properly: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
542 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
543 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
544
2432df265dad Uploaded
fcaramia
parents:
diff changeset
545 ### printing a final report for the alignment procedure
2432df265dad Uploaded
fcaramia
parents:
diff changeset
546 warn "Final Alignment report\n",'='x22,"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
547 print REPORT "Final Alignment report\n",'='x22,"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
548 # foreach my $index (0..$#fhs){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
549 # print "$fhs[$index]->{name}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
550 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
551 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
552 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
553
2432df265dad Uploaded
fcaramia
parents:
diff changeset
554 ### printing a final report for the methylation call procedure
2432df265dad Uploaded
fcaramia
parents:
diff changeset
555 warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
556 print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
557
2432df265dad Uploaded
fcaramia
parents:
diff changeset
558 my $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
559 print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
560 print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
561
2432df265dad Uploaded
fcaramia
parents:
diff changeset
562 print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
563 print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
564 print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
565 print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
566 print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
567
2432df265dad Uploaded
fcaramia
parents:
diff changeset
568
2432df265dad Uploaded
fcaramia
parents:
diff changeset
569 print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
570 print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
571 print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
572 print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
573 print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
574 ### detailed information about Cs analysed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
575
2432df265dad Uploaded
fcaramia
parents:
diff changeset
576 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
577 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
578 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
579 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
580
2432df265dad Uploaded
fcaramia
parents:
diff changeset
581 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
582 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
583
2432df265dad Uploaded
fcaramia
parents:
diff changeset
584 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
585 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
586 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
587 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
588 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
589 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
590 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
591 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
592
2432df265dad Uploaded
fcaramia
parents:
diff changeset
593 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
594 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
595 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
596 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
597 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
598 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
599 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
600
2432df265dad Uploaded
fcaramia
parents:
diff changeset
601 my $percent_meCHG;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
602 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
603 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
604 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
605
2432df265dad Uploaded
fcaramia
parents:
diff changeset
606 my $percent_meCHH;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
607 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
608 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
609 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
610
2432df265dad Uploaded
fcaramia
parents:
diff changeset
611 my $percent_meCpG;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
612 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
613 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
614 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
615
2432df265dad Uploaded
fcaramia
parents:
diff changeset
616 ### printing methylated CpG percentage if applicable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
617 if ($percent_meCpG){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
618 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
619 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
620 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
621 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
622 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
623 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
624 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
625
2432df265dad Uploaded
fcaramia
parents:
diff changeset
626 ### printing methylated C percentage in CHG context if applicable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
627 if ($percent_meCHG){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
628 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
629 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
630 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
631 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
632 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
633 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
634 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
635
2432df265dad Uploaded
fcaramia
parents:
diff changeset
636 ### printing methylated C percentage in CHH context if applicable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
637 if ($percent_meCHH){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
638 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
639 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
640 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
641 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
642 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
643 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
644 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
645
2432df265dad Uploaded
fcaramia
parents:
diff changeset
646 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
647
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### This is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### The file is read record by record (one header line followed by one sequence line) and every read is handed to the Bowtie 1 or
  ### Bowtie 2 result checker to see whether it was mapped to one (or both) of the converted genomes in either the C->T or G->A version.
  ###
  ### Arguments:
  ###   $sequence_file - FastA file holding the reads; a name ending in .gz is read through zcat
  ###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
  ###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
  ###
  ### Uses $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting, all of which are declared outside of this subroutine.

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the file name is passed to zcat as a single argument and is never interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # explicit read mode, so characters such as '>' or '|' in the file name cannot change what open() does
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <IN>;
    my $sequence = <IN>;
    # readline returns undef at end of file; testing definedness (not truth) keeps a final line consisting of a bare "0"
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # $count numbers every record in the file, $counting{sequences_count} only the ones that are actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
    }

    # normalise an undefined/empty return value so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  # Release the input handle (for gzipped input this also waits for zcat to finish). The result is deliberately not
  # checked: if $upto ended the loop early, zcat may exit abnormally once its pipe is closed and close() would report that.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
720
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### This is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### The file is read record by record (four lines each: header, sequence, second header, quality string) and every read is handed to the
  ### Bowtie 1 or Bowtie 2 result checker to see whether it was mapped to one (or both) of the converted genomes in either the C->T or G->A version.
  ###
  ### Arguments:
  ###   $sequence_file - FastQ file holding the reads; a name ending in .gz is read through zcat
  ###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
  ###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
  ###
  ### Uses $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting, all of which are declared outside of this subroutine.

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the file name is passed to zcat as a single argument and is never interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # explicit read mode, so characters such as '>' or '|' in the file name cannot change what open() does
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    # readline returns undef at end of file; testing definedness (not truth) keeps a final record whose
    # quality line is a bare "0" (a valid one-character quality string) without a trailing newline
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # $count numbers every record in the file, $counting{sequences_count} only the ones that are actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;
    # $identifier_2 is intentionally left un-chomped: it is only ever printed back out verbatim below

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
    }

    # normalise an undefined/empty return value so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  # Release the input handle (for gzipped input this also waits for zcat to finish). The result is deliberately not
  # checked: if $upto ended the loop early, zcat may exit abnormally once its pipe is closed and close() would report that.
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
800
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Both files are read in lock-step, record by record (one header line followed by one sequence line), and every pair is handed to the
  ### Bowtie 1 or Bowtie 2 result checker to see whether it produced an alignment to one (or both) of the converted genomes
  ### (either the C->T or G->A version).
  ###
  ### Arguments:
  ###   $sequence_file_1, $sequence_file_2 - FastA files holding read 1 and read 2; a name ending in .gz is read through zcat
  ###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2 - handed on unchanged to print_final_analysis_report_paired_ends
  ###
  ### Uses $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting, all of which are declared outside of this subroutine.

  ### Each infile is opened according to its own extension, so a pair consisting of one gzipped and one plain file is read correctly
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($handle,$file) = @{$input};
    if ($file =~ /\.gz$/){
      # list-form pipe open: the file name is passed to zcat as a single argument and is never interpreted by a shell
      open ($handle,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      # explicit read mode, so characters such as '>' or '|' in the file name cannot change what open() does
      open ($handle,'<',$file) or die "Failed to read from sequence file $file: $!\n";
    }
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    # readline returns undef at end of file; testing definedness (not truth) keeps a final line consisting of a bare "0"
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # $count numbers every record pair in the files, $counting{sequences_count} only the ones that are actually analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    # copies taken before chomp and before the leading '>' is removed; these are what gets written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    # only the identifier of read 1 is passed on to the result checker
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }

    # normalise an undefined/empty return value so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  # Release both input handles (for gzipped input this also waits for zcat to finish). The results are deliberately not
  # checked: if $upto ended the loop early, zcat may exit abnormally once its pipe is closed and close() would report that.
  close IN1;
  close IN2;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
892
2432df265dad Uploaded
fcaramia
parents:
diff changeset
893 sub process_fastQ_files_for_paired_end_methylation_calls{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
894 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
895 ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
2432df265dad Uploaded
fcaramia
parents:
diff changeset
896 ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
897 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
898 ### of the converted genomes (either C->T or G->A version)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
899
2432df265dad Uploaded
fcaramia
parents:
diff changeset
900 ### gzipped version of the infiles
2432df265dad Uploaded
fcaramia
parents:
diff changeset
901 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
902 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
903 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
904 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
905 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
906 open (IN1,$sequence_file_1) or die $!;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
907 open (IN2,$sequence_file_2) or die $!;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
908 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
909
2432df265dad Uploaded
fcaramia
parents:
diff changeset
910 my $count = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
911
2432df265dad Uploaded
fcaramia
parents:
diff changeset
912 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
913 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
2432df265dad Uploaded
fcaramia
parents:
diff changeset
914 while (1) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
915 # reading from the first input file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
916 my $identifier_1 = <IN1>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
917 my $sequence_1 = <IN1>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
918 my $ident_1 = <IN1>; # not needed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
919 my $quality_value_1 = <IN1>; # not needed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
920 # reading from the second input file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
921 my $identifier_2 = <IN2>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
922 my $sequence_2 = <IN2>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
923 my $ident_2 = <IN2>; # not needed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
924 my $quality_value_2 = <IN2>; # not needed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
925 last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
926
2432df265dad Uploaded
fcaramia
parents:
diff changeset
927 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
2432df265dad Uploaded
fcaramia
parents:
diff changeset
928 $identifier_2 = fix_IDs($identifier_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
929
2432df265dad Uploaded
fcaramia
parents:
diff changeset
930 ++$count;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
931
2432df265dad Uploaded
fcaramia
parents:
diff changeset
932 if ($skip){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
933 next unless ($count > $skip);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
934 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
935 if ($upto){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
936 last if ($count > $upto);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
937 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
938
2432df265dad Uploaded
fcaramia
parents:
diff changeset
939 $counting{sequences_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
940 if ($counting{sequences_count}%100000==0) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
941 warn "Processed $counting{sequences_count} sequences so far\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
942 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
943
2432df265dad Uploaded
fcaramia
parents:
diff changeset
944 my $orig_identifier_1 = $identifier_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
945 my $orig_identifier_2 = $identifier_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
946
2432df265dad Uploaded
fcaramia
parents:
diff changeset
947 chomp $sequence_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
948 chomp $identifier_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
949 chomp $sequence_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
950 chomp $identifier_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
951 chomp $quality_value_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
952 chomp $quality_value_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
953
2432df265dad Uploaded
fcaramia
parents:
diff changeset
954 $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
2432df265dad Uploaded
fcaramia
parents:
diff changeset
955
2432df265dad Uploaded
fcaramia
parents:
diff changeset
956 my $return;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
957 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
958 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
959 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
960 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
961 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
962 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
963
2432df265dad Uploaded
fcaramia
parents:
diff changeset
964 unless ($return){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
965 $return = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
966 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
967
2432df265dad Uploaded
fcaramia
parents:
diff changeset
968 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
969 if ($ambiguous and $return == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
970 # seq_1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
971 print AMBIG_1 $orig_identifier_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
972 print AMBIG_1 "$sequence_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
973 print AMBIG_1 $ident_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
974 print AMBIG_1 "$quality_value_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
975 # seq_2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
976 print AMBIG_2 $orig_identifier_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
977 print AMBIG_2 "$sequence_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
978 print AMBIG_2 $ident_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
979 print AMBIG_2 "$quality_value_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
980 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
981
2432df265dad Uploaded
fcaramia
parents:
diff changeset
982 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
983 elsif ($unmapped and $return == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
984 # seq_1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
985 print UNMAPPED_1 $orig_identifier_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
986 print UNMAPPED_1 "$sequence_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
987 print UNMAPPED_1 $ident_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
988 print UNMAPPED_1 "$quality_value_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
989 # seq_2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
990 print UNMAPPED_2 $orig_identifier_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
991 print UNMAPPED_2 "$sequence_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
992 print UNMAPPED_2 $ident_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
993 print UNMAPPED_2 "$quality_value_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
994 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
995 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
996
2432df265dad Uploaded
fcaramia
parents:
diff changeset
997 print "Processed $counting{sequences_count} sequences in total\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
998
2432df265dad Uploaded
fcaramia
parents:
diff changeset
999 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1000
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1001 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1002
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1003 sub check_bowtie_results_single_end{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1004 my ($sequence,$identifier,$quality_value) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1005
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1006 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1007 $quality_value = 'I'x(length$sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1008 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1009
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1010 my %mismatches = ();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1011 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1012 foreach my $index (0..$#fhs){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1013
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1014 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1015 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1016 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1017 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1018 ###############################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1019 ### STEP I Now processing the alignment stored in last_line ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1020 ###############################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1021 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1022 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1023 ### we only continue to extract useful information about this alignment if 1 was returned
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1024 if ($valid_alignment_found_1 == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1025 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1026 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1027 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1028
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1029 unless($mismatch_info){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1030 $mismatch_info = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1031 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1032
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1033 chomp $mismatch_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1034 my $chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1035 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1036 $chromosome = $mapped_chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1037 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1038 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1039 die "Chromosome number extraction failed for $mapped_chromosome\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1040 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1041 ### Now extracting the number of mismatches to the converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1042 my $number_of_mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1043 if ($mismatch_info eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1044 $number_of_mismatches = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1045 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1046 elsif ($mismatch_info =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1047 my @mismatches = split (/,/,$mismatch_info);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1048 $number_of_mismatches = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1049 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1050 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1051 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1052 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1053 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1054 my $alignment_location = join (":",$chromosome,$position);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1055 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1056 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1057 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1058 ### number for the found alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1059 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1060 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1061 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1062 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1063 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1064 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1065 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1066 $number_of_mismatches = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1067 ##################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1068 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1069 ### new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1070 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1071 ##################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1072 my $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1073 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1074 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1075 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1076 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1077 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1078 else {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1079 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1080 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1081 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1082 next;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1083 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1084 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1085 ### we only continue to extract useful information about this second alignment if 1 was returned
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1086 if ($valid_alignment_found_2 == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1087 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1088 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1089 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1090 unless($mismatch_info){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1091 $mismatch_info = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1092 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1093 chomp $mismatch_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1094
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1095 my $chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1096 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1097 $chromosome = $mapped_chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1098 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1099 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1100 die "Chromosome number extraction failed for $mapped_chromosome\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1101 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1102
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1103 ### Now extracting the number of mismatches to the converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1104 my $number_of_mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1105 if ($mismatch_info eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1106 $number_of_mismatches = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1107 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1108 elsif ($mismatch_info =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1109 my @mismatches = split (/,/,$mismatch_info);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1110 $number_of_mismatches = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1111 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1112 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1113 die "Something weird is going on with the mismatch field\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1114 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1115 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1116 ### extracting the chromosome number from the bowtie output (see above)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1117 my $alignment_location = join (":",$chromosome,$position);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1118 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1119 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1120 ### case we are not writing the same entry out a second time.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1121 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1122 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1123 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1124 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1125 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1126 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1127 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1128 ####################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1129 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1130 ####################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1131 $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1132 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1133 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1134 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1135 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1136 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1137 next;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1138 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1139 else {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1140 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1141 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1142 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1143 next;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1144 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1145 ### still within the 2nd sequence in correct orientation found
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1146 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1147 ### still within the 1st sequence in correct orientation found
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1148 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1149 ### still within the if (last_seq_id eq identifier) condition
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1150 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1151 ### still within foreach index loop
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1152 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1153 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1154 unless(%mismatches){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1155 $counting{no_single_alignment_found}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1156 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1157 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1158 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1159 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1160 return;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1161 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1162 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1163 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1164 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1165 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1166 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1167 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1168 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1169 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1170 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1171 my $sequence_fails = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1172 ### Declaring an empty hash reference which will store all information we need for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1173 my $methylation_call_params; # hash reference!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1174 ### sorting in ascending order
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1175 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1176
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1177 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1178 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1179 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1180 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1181 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1182 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1183 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1184 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1185 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1186 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1187 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1188 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1189 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1190 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1191 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1192 ### reaction. E.g.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1193 ### CAGTCACGCGCGCGCG will become
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1194 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1195 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1196 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1197 ### G->A conversion:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1198 ### highly methylated: CAATCACACACACACA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1199 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1200 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1201 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1202 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1203 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1204 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1205 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1206 ### In the above example the number of transliterations required to transform the actual sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1207 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1208 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1209 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1210 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1211 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1212 ### smaller than the number of transliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1213 my @three_candidate_seqs;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1214 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1215 my $transliterations_performed;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1216 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1217 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1218 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1219 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1220 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1221 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1222 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1223 die "unexpected index number range $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1224 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1225 push @three_candidate_seqs,{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1226 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1227 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1228 mismatch_number => $mismatch_number,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1229 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1230 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1231 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1232 transliterations_performed => $transliterations_performed,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1233 };
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1234 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1235 ### sorting in ascending order for the lowest number of transliterations performed
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1236 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1237 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1238 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1239 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1240 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1241 if (($first_array_element*2) < $second_array_element){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1242 $counting{low_complexity_alignments_overruled_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1243 ### taking the index with the unique best hit and overruling low complexity alignments with 2 hits
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1244 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1245 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1246 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1247 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1248 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1249 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1250 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1251 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1252 $sequence_fails = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1253 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1254 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1255 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1256 $sequence_fails = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1257 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1258 ### after processing the alignment with the lowest number of mismatches we exit
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1259 last;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1260 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1261 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1262 if ($sequence_fails == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1263 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1264 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1265 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1266 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1267 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1268 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1269 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1270 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1271 return 0; # => exits to next sequence (default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1272 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1273 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1274
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1275 ### --DIRECTIONAL
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1276 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1277 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1278 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1279 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1280 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1281 $counting{alignments_rejected_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1282 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1283 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1284 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1285
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1286 ### If the sequence has not been rejected so far it will have a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1287 $counting{unique_best_alignment_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1288 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1289 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1290 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1291 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1292 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1293 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1294 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1295
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1296 ### otherwise we are set to perform the actual methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1297 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1298
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1299 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1300 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1301 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1302
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1303 sub check_bowtie_results_single_end_bowtie2{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1304 my ($sequence,$identifier,$quality_value) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1305
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1306 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1307 $quality_value = 'I'x(length$sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1308 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1309
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1310 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1311 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1312
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1313 my $alignment_ambiguous = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1314
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1315 my %alignments = ();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1316
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1317 ### reading from the Bowtie 2 output filehandles
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1318 foreach my $index (0..$#fhs){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1319 # print "Index: $index\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1320 # print "$fhs[$index]->{last_line}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1321 # print "$fhs[$index]->{last_seq_id}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1322
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1323 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1324 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1325
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1326 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1327 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1328
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1329 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1330
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1331 # SAM format specifications for Bowtie 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1332 # (1) Name of read that aligned
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1333 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1334 # 1 The read is one of a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1335 # 2 The alignment is one end of a proper paired-end alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1336 # 4 The read has no reported alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1337 # 8 The read is one of a pair and has no reported alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1338 # 16 The alignment is to the reverse reference strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1339 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1340 # 64 The read is mate 1 in a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1341 # 128 The read is mate 2 in a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1342 # 256 The read has multiple mapping states
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1343 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1344 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1345 # (5) Mapping quality (255 means MAPQ is not available)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1346 # (6) CIGAR string representation of alignment (* if unavailable)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1347 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1348 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1349 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1350 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1351 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1352 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1353 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1354 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1355 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1356 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1357 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1358 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1359 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1360 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1361 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1362 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1363
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1364 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1365
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1366 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1367 if ($flag == 4){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1368 ## reading in the next alignment, which must be the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1369 my $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1370 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1371 chomp $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1372 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1373 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1374 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1375 if ($seq_id eq $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1376 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1377 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1378 next; # next instance
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1379 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1380 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1381 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1382 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1383 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1384 next;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1385 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1386 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1387
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1388 # if there are one or more proper alignments we can extract the chromosome number
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1389 my $chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1390 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1391 $chromosome = $mapped_chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1392 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1393 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1394 die "Chromosome number extraction failed for $mapped_chromosome\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1395 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1396
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1397 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1398 my ($alignment_score,$second_best,$MD_tag);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1399 my @fields = split (/\t/,$fhs[$index]->{last_line});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1400
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1401 foreach (11..$#fields){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1402 if ($fields[$_] =~ /AS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1403 $alignment_score = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1404 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1405 elsif ($fields[$_] =~ /XS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1406 $second_best = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1407 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1408 elsif ($fields[$_] =~ /MD:Z:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1409 $MD_tag = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1410 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1411 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1412
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1413 # warn "First best alignment_score is: '$alignment_score'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1414 # warn "MD tag is: '$MD_tag'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1415 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1416
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1417 if (defined $second_best){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1418 # warn "second best alignment_score is: '$second_best'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1419
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1420 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1421 if ($alignment_score == $second_best){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1422 $alignment_ambiguous = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1423 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1424 until ($fhs[$index]->{last_seq_id} ne $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1425 my $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1426 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1427 chomp $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1428 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1429 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1430 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1431 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1432 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1433 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1434 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1435 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1436 last; # break free in case we have reached the end of the alignment output
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1437 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1438 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1439 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1440 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1441 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1442
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1443 my $alignment_location = join (":",$chromosome,$position);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1444
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1445 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1446 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1447 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1448 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1449
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1450 unless (exists $alignments{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1451 $alignments{$alignment_location}->{seq_id} = $id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1452 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1453 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1454 $alignments{$alignment_location}->{index} = $index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1455 $alignments{$alignment_location}->{chromosome} = $chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1456 $alignments{$alignment_location}->{position} = $position;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1457 $alignments{$alignment_location}->{CIGAR} = $cigar;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1458 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1459 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1460
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1461 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1462 until ($fhs[$index]->{last_seq_id} ne $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1463 my $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1464 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1465 chomp $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1466 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1467 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1468 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1469 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1470 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1471 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1472 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1473 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1474 last; # break free in case we have reached the end of the alignment output
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1475 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1476 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1477 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1478 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1479 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1480 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1481
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1482 my $alignment_location = join (":",$chromosome,$position);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1483
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1484 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1485 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1486 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1487 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1488
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1489 unless (exists $alignments{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1490 $alignments{$alignment_location}->{seq_id} = $id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1491 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1492 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1493 $alignments{$alignment_location}->{index} = $index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1494 $alignments{$alignment_location}->{chromosome} = $chromosome;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1495 $alignments{$alignment_location}->{position} = $position;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1496 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1497 $alignments{$alignment_location}->{CIGAR} = $cigar;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1498 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1499
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1500 my $newline = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1501 if ($newline){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1502 chomp $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1503 my ($seq_id) = split (/\t/,$newline);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1504 $fhs[$index]->{last_seq_id} = $seq_id;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1505 $fhs[$index]->{last_line} = $newline;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1506 if ($seq_id eq $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1507 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1508 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1509 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1510 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1511 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1512 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1513 $fhs[$index]->{last_line} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1514 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1515 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1516 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1517 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1518
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1519 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1520 if ($alignment_ambiguous == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1521 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1522 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1523 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1524 # print "$ambiguous_read_output\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1525
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1526 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1527 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1528 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1529 elsif ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1530 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1531 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1532 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1533 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1534 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1535 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1536
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1537 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1538 unless(%alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1539 $counting{no_single_alignment_found}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1540 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1541 # print "$unmapped_read_output\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1542 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1543 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1544 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1545 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1546 return 0; # default
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1547 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1548 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1549
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1550 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1551
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1552 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1553 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1554 ### alignment score we are discarding the sequence altogether.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1555 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1556 ### opening (5) and extending (3 per bp) the gap.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1557
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1558 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1559
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1560 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1561 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1562
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1563 ### print contents of %alignments for debugging
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1564 # if (scalar keys %alignments > 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1565 # print "\n******\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1566 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1567 # print "Loc: $alignment_location\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1568 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1569 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1570 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1571 # print "Index $alignments{$alignment_location}->{index}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1572 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1573 # print "pos: $alignments{$alignment_location}->{position}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1574 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1575 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1576 # print "\n******\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1577 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1578
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1579 ### if there is only 1 entry in the hash we accept it as the best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1580 if (scalar keys %alignments == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1581 for my $unique_best_alignment (keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1582 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1583 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1584 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1585 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1586 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1587 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1588 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1589 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1590 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1591
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1592 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1593 ### we boot the sequence altogether)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1594 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1595 my $best_alignment_score;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1596 my $best_alignment_location;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1597 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1598 # print "$alignments{$alignment_location}->{alignment_score}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1599 unless (defined $best_alignment_score){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1600 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1601 $best_alignment_location = $alignment_location;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1602 # print "setting best alignment score: $best_alignment_score\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1603 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1604 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1605 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1606 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1607 # warn "Same alignment score, the sequence will get booted!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1608 $sequence_fails = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1609 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1610 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1611 ### else we are going to store the best alignment for further processing
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1612 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1613 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1614 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1615 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1616 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1617 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1618 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1619 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1620 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1621 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1622 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1623 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1624 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1625 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1626 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1627 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1628
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1629 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1630 if ($sequence_fails == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1631 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1632
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1633 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1634 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1635 # print OUT "$ambiguous_read_output\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1636
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1637 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1638 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1639 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1640 elsif ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1641 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1642 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1643 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1644 return 0; # => exits to next sequence (default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1645 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1646 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1647
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1648 ### --DIRECTIONAL
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1649 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1650 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1651 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1652 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1653 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1654 $counting{alignments_rejected_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1655 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1656 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1657 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1658
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1659 ### If the sequence has not been rejected so far it has a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1660 $counting{unique_best_alignment_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1661
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1662 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1663 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1664
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1665 ### sanity check to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1666 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1667 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1668 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1669 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1670 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1671
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1672
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1673 ### otherwise we are set to perform the actual methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1674 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1675 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1676 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1677 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1678
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1679
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Counts how many bases of a read are affected by the in-silico bisulfite conversion of that read.
###
### Arguments:
###   $sequence        - the read sequence; only upper-case 'C' (mode 'CT') or 'G' (mode 'GA') characters are counted
###   $read_conversion - conversion mode of the read, either 'CT' (C->T) or 'GA' (G->A)
###
### Returns the number of C characters (mode 'CT') or G characters (mode 'GA') found in $sequence.
### Dies if $read_conversion is anything other than 'CT' or 'GA' (this includes an undefined value).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # treat a missing mode like any other unknown mode instead of tripping an 'uninitialized value' warning first
  my $mode = defined $read_conversion ? $read_conversion : '';

  # tr/// with an empty replacement list only counts the matching characters; the count is identical to
  # the one returned by tr/C/T/ or tr/G/A/, and the local copy of the sequence is left untouched
  if ($mode eq 'CT'){
    return ($sequence =~ tr/C//);
  }
  if ($mode eq 'GA'){
    return ($sequence =~ tr/G//);
  }

  # $! is deliberately not part of the message: no system call failed here, so it would only append
  # an unrelated (and misleading) operating system error text
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got '$mode')\n";
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1694
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Decides whether the Bowtie 1 single-end alignment buffered for filehandle $index can be used for the read $identifier.
###
### Arguments:
###   $index      - index into @fhs selecting the Bowtie output stream to look at
###   $identifier - ID of the read that is currently being processed
###
### Returns 1 if $fhs[$index]->{last_line} now holds an alignment of $identifier in a sensible orientation,
### and 0 if there is nothing to process for this read in this stream. Up to two further lines may be read from
### the stream; whatever line is read last is buffered in last_seq_id/last_line (both are set to undef once the
### end of the Bowtie output has been reached).
### Dies if the orientation check returns something other than 0 or 1, or if the same read ID shows up a third time.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # buffers an (ID, line) pair as the pending entry of this stream; passing undef for both marks the end of the output
  my $buffer_entry = sub {
    my ($pending_id,$pending_line) = @_;
    $fhs[$index]->{last_seq_id} = $pending_id;
    $fhs[$index]->{last_line}   = $pending_line;
  };

  # Bowtie 1 format: the read ID is the first and the strand the second tab-separated field
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the buffered line already belongs to the next read to be analysed -> it will be looked at in a later round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### 1st alignment: only 4 of the 8 possible index/strand combinations are theoretically sensible
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);   ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the 1st alignment was in the wrong orientation, so we need to look at the following line
  my $second_line = $fhs[$index]->{fh}->getline();
  unless ($second_line){
    $buffer_entry->(undef,undef);    # end of bowtie output
    return 0;
  }

  ($id,$strand) = (split (/\t/,$second_line))[0,1];

  ### the line we just read is already the next read -> keep it for the next round
  if ($id ne $identifier){
    $buffer_entry->($id,$second_line);
    return 0;
  }

  ### 2nd alignment of the same read: check the orientation once more
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $buffer_entry->($id,$second_line);
    return 1;                        ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### both alignments were in the wrong orientation; read yet another line so that @fhs points at the next read
  my $third_line = $fhs[$index]->{fh}->getline();
  unless ($third_line){
    $buffer_entry->(undef,undef);    # end of bowtie output
    return 0;
  }

  ### a read must not be reported more than twice in a row
  my ($seq_id) = split (/\t/,$third_line);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);

  $buffer_entry->($seq_id,$third_line);
  return 0;                          # nothing to process this round, both alignments of this read were unusable
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1772 #########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1773 ### BOWTIE 1 | PAIRED-END
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1774 #########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1775
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1776 sub check_bowtie_results_paired_ends{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1777 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1778
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1779 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1780 unless ($quality_value_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1781 $quality_value_1 = 'I'x(length$sequence_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1782 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1783 unless ($quality_value_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1784 $quality_value_2 = 'I'x(length$sequence_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1785 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1786
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1787 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1788
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1789 my %mismatches = ();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1790 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1791
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1792
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1793 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1794 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1795 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1796 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1797 ### strands are not being reported by specifying --directional
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1798
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1799 foreach my $index (0,3,1,2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1800 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1801 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1802 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1803 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1804 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1805
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1806 ##################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1807 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1808 ##################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1809 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1810 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments both in the wrong
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1811 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1812 if ($valid_alignment_found == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1813 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1814 ### we store the useful information in %mismatches
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1815 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1816 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1817 chomp $mismatch_info_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1818 chomp $mismatch_info_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1819
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1820 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1821 my ($chromosome_1,$chromosome_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1822 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1823 $chromosome_1 = $mapped_chromosome_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1824 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1825 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1826 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1827 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1828 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1829 $chromosome_2 = $mapped_chromosome_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1830 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1831 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1832 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1833 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1834
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1835 ### Now extracting the number of mismatches to the converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1836 my $number_of_mismatches_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1837 my $number_of_mismatches_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1838 if ($mismatch_info_1 eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1839 $number_of_mismatches_1 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1840 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1841 elsif ($mismatch_info_1 =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1842 my @mismatches = split (/,/,$mismatch_info_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1843 $number_of_mismatches_1 = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1844 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1845 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1846 die "Something weird is going on with the mismatch field\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1847 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1848 if ($mismatch_info_2 eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1849 $number_of_mismatches_2 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1850 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1851 elsif ($mismatch_info_2 =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1852 my @mismatches = split (/,/,$mismatch_info_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1853 $number_of_mismatches_2 = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1854 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1855 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1856 die "Something weird is going on with the mismatch field\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1857 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1858 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1859 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1860 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1861 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1862 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1863 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1864 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1865 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1866 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1867 ### number for the found alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1868 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1869 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1870 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1871 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1872 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1873 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1874 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1875 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1876 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1877 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1878 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1879 ###################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1880 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1881 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2. ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1882 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1883 ### this round ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1884 ###################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1885 my $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1886 my $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1887
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1888 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1889 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1890 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1891
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1892 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1893 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1894 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1895 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1896 $fhs[$index]->{last_seq_id} = $seq_id_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1897 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1898 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1899 die "Either read 1 or read 2 needs to end on '/1'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1900 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1901
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1902 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1903 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1904 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1905 else {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1906 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1907 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1908 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1909 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1910 next; # jumping to the next index
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1911 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1912 ### Now processing the entry we just stored in last_line_1 and last_line_2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1913 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1914 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1915 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1916 if ($valid_alignment_found == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1917 ### we store the useful information in %mismatches
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1918 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1919 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1920 chomp $mismatch_info_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1921 chomp $mismatch_info_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1922 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1923 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1924 $chromosome_1 = $mapped_chromosome_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1925 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1926 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1927 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1928 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1929 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1930 $chromosome_2 = $mapped_chromosome_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1931 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1932 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1933 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1934 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1935
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1936 $number_of_mismatches_1='';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1937 $number_of_mismatches_2='';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1938 ### Now extracting the number of mismatches to the converted genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1939 if ($mismatch_info_1 eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1940 $number_of_mismatches_1 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1941 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1942 elsif ($mismatch_info_1 =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1943 my @mismatches = split (/,/,$mismatch_info_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1944 $number_of_mismatches_1 = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1945 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1946 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1947 die "Something weird is going on with the mismatch field\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1948 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1949 if ($mismatch_info_2 eq ''){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1950 $number_of_mismatches_2 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1951 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1952 elsif ($mismatch_info_2 =~ /^\d/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1953 my @mismatches = split (/,/,$mismatch_info_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1954 $number_of_mismatches_2 = scalar @mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1955 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1956 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1957 die "Something weird is going on with the mismatch field\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1958 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1959 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1960 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1961 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1962 die "position 1 is greater than position 2" if ($position_1 > $position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1963 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1964 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1965 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1966 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1967 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1968 ### number for the found alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1969 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1970 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1971 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1972 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1973 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1974 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1975 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1976 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1977 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1978 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1979 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1980 ###############################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1981 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1982 ###############################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1983 $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1984 $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1985
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1986 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1987 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1988 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1989
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1990 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1991 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1992 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1993 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1994 $fhs[$index]->{last_seq_id} = $seq_id_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1995 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1996 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1997 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1998 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
1999 else {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2000 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2001 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2002 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2003 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2004 next; # jumping to the next index
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2005 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2006 ### within the 2nd sequence pair alignment in correct orientation found
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2007 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2008 ### within the 1st sequence pair alignment in correct orientation found
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2009 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2010 ### still within the (last_seq_id eq identifier) condition
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2011 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2012 ### still within foreach index loop
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2013 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2014 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2015 unless(%mismatches){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2016 $counting{no_single_alignment_found}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2017 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2018 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2019 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2020 my $sequence_pair_fails = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2021 ### Declaring an empty hash reference which will store all information we need for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2022 my $methylation_call_params; # hash reference!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2023 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2024 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2025 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2026 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2027 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2028 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2029 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2030 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2031 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2032 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2033 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2034 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2035 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2036 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2037 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2038 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2039 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2040 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2041 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2042 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2043 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2044 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2045 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2046 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2047 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2048 $sequence_pair_fails = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2049 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2050 ### after processing the alignment with the lowest number of mismatches we exit
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2051 last;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2052 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2053 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2054 if ($sequence_pair_fails == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2055 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2056 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2057 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2058 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2059 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2060 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2061 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2062 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2063 return 0; # => exits to next sequence (default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2064 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2065 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2066
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2067 ### --DIRECTIONAL
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2068 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2069 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2070 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2071 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2072 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2073 $counting{alignments_rejected_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2074 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2075 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2076 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2077
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2078 ### If the sequence has not been rejected so far it does have a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2079 $counting{unique_best_alignment_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2080 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2081
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2082 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2083 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2084 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2085 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2086 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2087 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2088 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2089 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2090 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2091 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2092 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2093
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2094 ### otherwise we are set to perform the actual methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2095 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2096 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2097
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2098 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2099 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2100 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2101
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2102 #########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2103 ### BOWTIE 2 | PAIRED-END
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2104 #########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2105
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2106 sub check_bowtie_results_paired_ends_bowtie2{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2107 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2108
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2109 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2110 unless ($quality_value_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2111 $quality_value_1 = 'I'x(length$sequence_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2112 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2113
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2114 unless ($quality_value_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2115 $quality_value_2 = 'I'x(length$sequence_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2116 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2117
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2118
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2119 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2120
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2121
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2122 my %alignments;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2123 my $alignment_ambiguous = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2124
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2125 ### reading from the Bowtie 2 output filehandles
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2126
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2127 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2128 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2129 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2130 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2131 ### strands are not being reported when '--directional' is specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2132
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2133 foreach my $index (0,3,1,2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2134 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2135 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2136
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2137 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2138 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2139
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2140 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2141 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2142 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2143 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2144 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2145 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2146 $id_1 =~ s/\/1$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2147 $id_2 =~ s/\/2$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2148
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2149 # SAM format specifications for Bowtie 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2150 # (1) Name of read that aligned
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2151 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2152 # 1 The read is one of a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2153 # 2 The alignment is one end of a proper paired-end alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2154 # 4 The read has no reported alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2155 # 8 The read is one of a pair and has no reported alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2156 # 16 The alignment is to the reverse reference strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2157 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2158 # 64 The read is mate 1 in a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2159 # 128 The read is mate 2 in a pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2160 # 256 The read has multiple mapping states
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2161 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2162 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2163 # (5) Mapping quality (255 means MAPQ is not available)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2164 # (6) CIGAR string representation of alignment (* if unavailable)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2165 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2166 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2167 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2168 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2169 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2170 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2171 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2172 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2173 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2174 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2175 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2176 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2177 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2178 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2179 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2180 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2181
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2182 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2183 ### We can store the next alignment and move on to the next Bowtie 2 instance
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2184 if ($flag_1 == 77 and $flag_2 == 141){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2185 ## reading in the next alignment, which must be the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2186 my $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2187 my $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2188
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2189 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2190 chomp $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2191 chomp $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2192 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2193 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2194 $seq_id_1 =~ s/\/1$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2195 $seq_id_2 =~ s/\/2$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2196 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2197 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2198 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2199
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2200 # print "current sequence ($identifier) did not map, reading in next sequence\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2201 # print "$index\t$fhs[$index]->{last_seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2202 # print "$index\t$fhs[$index]->{last_line_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2203 # print "$index\t$fhs[$index]->{last_line_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2204 next; # next instance
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2205 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2206 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2207 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2208 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2209 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2210 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2211 next;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2212 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2213 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2214
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2215 ### If there are one or more proper alignments we can extract the chromosome number
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2216 my ($chromosome_1,$chromosome_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2217 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2218 $chromosome_1 = $mapped_chromosome_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2219 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2220 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2221 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2222 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2223 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2224 $chromosome_2 = $mapped_chromosome_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2225 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2226 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2227 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2228 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2229
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2230 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2231
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2232 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2233 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2234
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2235 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2236 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2237
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2238 foreach (11..$#fields_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2239 if ($fields_1[$_] =~ /AS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2240 $alignment_score_1 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2241 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2242 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2243 $second_best_1 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2244 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2245 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2246 $MD_tag_1 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2247 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2248 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2249
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2250 foreach (11..$#fields_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2251 if ($fields_2[$_] =~ /AS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2252 $alignment_score_2 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2253 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2254 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2255 $second_best_2 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2256 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2257 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2258 $MD_tag_2 = $1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2259 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2260 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2261
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2262 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2263 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2264
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2265 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2266 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2267 # warn "MD tag 1 is: '$MD_tag_1'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2268 # warn "MD tag 2 is: '$MD_tag_2'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2269
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2270 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2271 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2272 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2273
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2274 if (defined $second_best_1 and defined $second_best_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2275 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2276 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2277 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2278 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2279
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2280 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2281 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2282 $alignment_ambiguous = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2283 # print "This read will be chucked (AS==XS detected)!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2284
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2285 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2286 until ($fhs[$index]->{last_seq_id} ne $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2287 my $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2288 my $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2289 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2290 chomp $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2291 chomp $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2292 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2293 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2294 $seq_id_1 =~ s/\/1$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2295 $seq_id_2 =~ s/\/2$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2296 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2297
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2298 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2299 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2300 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2301 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2302 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2303 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2304 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2305 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2306 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2307 last; # break free if the end of the alignment output was reached
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2308 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2309 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2310 # if ($fhs[$index]->{last_seq_id}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2311 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2312 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2313 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2314 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2315
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2316 my $alignment_location;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2317 if ($position_1 <= $position_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2318 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2319 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2320 elsif($position_2 < $position_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2321 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2322 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2323
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2324 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2325 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2326 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2327 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2328
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2329 unless (exists $alignments{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2330 $alignments{$alignment_location}->{seq_id} = $id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2331 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2332 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2333 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2334 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2335 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2336 $alignments{$alignment_location}->{index} = $index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2337 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2338 $alignments{$alignment_location}->{position_1} = $position_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2339 $alignments{$alignment_location}->{position_2} = $position_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2340 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2341 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2342 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2343 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2344 $alignments{$alignment_location}->{flag_1} = $flag_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2345 $alignments{$alignment_location}->{flag_2} = $flag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2346 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2347 # warn "added best of several alignments to \%alignments hash\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2348
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2349 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2350 until ($fhs[$index]->{last_seq_id} ne $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2351 my $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2352 my $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2353 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2354 chomp $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2355 chomp $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2356 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2357 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2358 $seq_id_1 =~ s/\/1$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2359 $seq_id_2 =~ s/\/2$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2360 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2361
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2362 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2363 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2364 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2365 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2366 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2367 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2368 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2369 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2370 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2371 last; # break free if the end of the alignment output was reached
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2372 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2373 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2374 # if($fhs[$index]->{last_seq_id}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2375 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2376 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2377 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2378 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2379 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2380
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2381 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2382 # print "$alignment_location\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2383 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2384 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2385 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2386 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2387
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2388 unless (exists $alignments{$alignment_location}){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2389 $alignments{$alignment_location}->{seq_id} = $id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2390 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2391 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2392 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2393 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2394 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2395 $alignments{$alignment_location}->{index} = $index;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2396 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2397 $alignments{$alignment_location}->{position_1} = $position_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2398 $alignments{$alignment_location}->{position_2} = $position_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2399 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2400 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2401 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2402 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2403 $alignments{$alignment_location}->{flag_1} = $flag_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2404 $alignments{$alignment_location}->{flag_2} = $flag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2405 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2406
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2407 # warn "added unique alignment to \%alignments hash\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2408
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2409 # Now reading and storing the next read pair
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2410 my $newline_1 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2411 my $newline_2 = $fhs[$index]->{fh}-> getline();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2412 if ($newline_1 and $newline_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2413 chomp $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2414 chomp $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2415 # print "$newline_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2416 # print "$newline_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2417 my ($seq_id_1) = split (/\t/,$newline_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2418 my ($seq_id_2) = split (/\t/,$newline_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2419 $seq_id_1 =~ s/\/1$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2420 $seq_id_2 =~ s/\/2$//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2421 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2422
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2423 $fhs[$index]->{last_seq_id} = $seq_id_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2424 $fhs[$index]->{last_line_1} = $newline_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2425 $fhs[$index]->{last_line_2} = $newline_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2426
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2427 if ($seq_id_1 eq $identifier){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2428 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2429 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2430 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2431 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2432 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2433 $fhs[$index]->{last_seq_id} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2434 $fhs[$index]->{last_line_1} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2435 $fhs[$index]->{last_line_2} = undef;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2436 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2437 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2438 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2439 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2440
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2441 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return right away. If --ambiguous was specified the read sequence will be printed out in FastQ format
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2442 if ($alignment_ambiguous == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2443 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2444 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2445 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2446 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2447 # print "$ambiguous_read_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2448 # print "$ambiguous_read_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2449
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2450 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2451 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2452 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2453 elsif ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2454 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2455 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2456 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2457 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2458 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2459 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2460
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2461 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2462 unless (%alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2463 $counting{no_single_alignment_found}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2464
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2465 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2466 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2467 # print "$unmapped_read_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2468 # print "$unmapped_read_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2469 if ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2470 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2471 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2472 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2473 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2474 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2475 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2476
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2477 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2478
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2479 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2480 ### single best position we are going to store the alignment information in the $methylation_call_params hash reference. If there are multiple hits with the same (highest)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2481 ### alignment score we are discarding the sequence pair altogether.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2482 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2483 ### and extending (3 per bp) the gap.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2484
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2485 #######################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2486
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2487 ### Declaring an empty hash reference which will store all information we need for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2488 my $methylation_call_params; # hash reference
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2489 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2490
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2491 ### print contents of %alignments for debugging
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2492 ## if (scalar keys %alignments >= 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2493 # print "\n******\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2494 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2495 # print "Loc: $alignment_location\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2496 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2497 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2498 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2499 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2500 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2501 # print "Index $alignments{$alignment_location}->{index}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2502 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2503 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2504 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2505 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2506 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2507 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2508 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2509 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2510 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2511 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2512 # print "\n******\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2513 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2514
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2515 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2516 if (scalar keys %alignments == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2517 for my $unique_best_alignment (keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2518 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2519 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2520 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2521 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2522 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2523 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2524 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2525 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2526 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2527 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2528 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2529 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2530 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2531 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2532 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2533 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2534 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2535
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2536 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2537 ### we boot the sequence pair altogether)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2538 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2539 my $best_sum_of_alignment_scores;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2540 my $best_alignment_location;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2541 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2542 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2543 unless (defined $best_sum_of_alignment_scores){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2544 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2545 $best_alignment_location = $alignment_location;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2546 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2547 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2548 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2549 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2550 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2551 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2552 $sequence_pair_fails = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2553 last; # exiting since we know that the sequence has ambiguous alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2554 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2555 ### else we are going to store the best alignment for further processing
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2556 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2557 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2558 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2559 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2560 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2561 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2562 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2563 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2564 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2565 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2566 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2567 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2568 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2569 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2570 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2571 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2572 last; # exiting since the sequence produced a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2573 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2574 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2575 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2576 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2577 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2578 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2579 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2580
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2581 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2582 if ($sequence_pair_fails == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2583 $counting{unsuitable_sequence_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2584
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2585 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2586 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2587 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2588 # print "$ambiguous_read_1\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2589 # print "$ambiguous_read_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2590
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2591 if ($ambiguous){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2592 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2593 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2594 elsif ($unmapped){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2595 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2596 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2597 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2598 return 0; # => exits to next sequence pair (default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2599 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2600 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2601
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2602 ### --DIRECTIONAL
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2603 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2604 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2605 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2606 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2607 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2608 $counting{alignments_rejected_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2609 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2610 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2611 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2612
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2613 ### If the sequence pair has not been rejected so far it does have a unique best alignment
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2614 $counting{unique_best_alignment_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2615 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2616
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2617 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2618 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2619 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2620 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2621 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2622 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2623 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2624 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2625 $counting{genomic_sequence_could_not_be_extracted_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2626 return 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2627 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2628
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2629 ### now we are set to perform the actual methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2630 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2631 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2632 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2633 # print " $sequence_2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2634 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2635 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2636
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2637 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2638 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2639 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2640
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2641 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2642
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Decides whether the paired-end alignment currently buffered for Bowtie result filehandle $index
### (in $fhs[$index]->{last_line_1} and ->{last_line_2}) is an alignment of the sequence pair
### $identifier in a sensible orientation.
###
### Arguments: $index      - position of the Bowtie output filehandle in @fhs
###            $identifier - ID of the sequence pair being processed (read ID without the trailing /1)
###
### Returns 1 if the entry stored in @fhs for $index is a correctly oriented alignment of $identifier,
### and 0 otherwise. When the buffered alignment has to be discarded, further line pairs are read from
### the filehandle and last_seq_id/last_line_1/last_line_2 are updated (set to undef once the output is
### exhausted), so that @fhs always holds the next entry to be looked at.
###
### Dies if neither read ID of a freshly read pair ends on /1, if the orientation check returns
### something other than 0 or 1, or if the same sequence ID turns up a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### only the read IDs and strands are needed here, the remaining Bowtie fields are not looked at
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//; # removing the read /1
  $seq_id_2 =~ s/\/1$//; # removing the read /1

  ### the sequence pair stored in @fhs is already the next sequence pair to be analysed -> analyse next round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions,
  ### however only 4 of them are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1); ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output; nothing to process as the stored alignment was in the wrong orientation
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
    return 0;
  }

  ### extract the alignment information again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;

  ### we need to capture the first read (ending on /1); only the ID that carries the tag is modified
  my $seqid;
  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### still the same sequence pair -> checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### the second alignment becomes the current entry
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation again, so we read in yet another 2 lines and store
  ### them in @fhs (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
    return 0;
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);

  $seqid = '';
  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the next 2 lines must not have the same seq ID again; otherwise they become the current entry
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as both alignments of $identifier were in the wrong orientation
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2763
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2764 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2765
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Extracts, for a Bowtie 1 paired-end alignment, the unconverted genomic sequence underlying each of
### the two reads and records it together with strand and conversion information.
###
### Arguments: $sequence_identifier     - ID of the sequence pair
###            $methylation_call_params - hash reference; the entry for $sequence_identifier is read
###                                       (index, chromosome, start_seq_1/2, bowtie_sequence_1/2) and
###                                       extended with alignment_read_1/2, genome_conversion,
###                                       read_conversion_1/2 and unmodified_genomic_sequence_1/2
###
### Each genomic sequence is taken 2 bases longer than the aligned sequence, so that a CpG, CHG or CHH
### context can also be determined for a C at the first or last position of the observed sequence.
### A sequence is set to '' where one of the window checks below fails.
### Dies if the index is not 0, 1, 2 or 3. Also increments the per-strand counter in %counting.
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands.
  ### We are also giving the sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my $params   = ($methylation_call_params->{$sequence_identifier} ||= {});
  my $index    = $params->{index};
  my $chr      = $params->{chromosome};
  my $start_1  = $params->{start_seq_1};
  my $start_2  = $params->{start_seq_2};
  my $window_1 = length($params->{bowtie_sequence_1})+2; # aligned length of the first Bowtie hit plus the 2 context bases
  my $window_2 = length($params->{bowtie_sequence_2})+2; # aligned length of the second Bowtie hit plus the 2 context bases

  my ($alignment_read_1,$alignment_read_2);
  my ($read_conversion_info_1,$read_conversion_info_2);
  my $genome_conversion;
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether
  ### read 1 or read 2 was the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment
  ### we need to swap the extracted genomic sequences around!

  ### NOTE on the 5' windows (start - 2): substr() interprets a negative offset as counting from the END of the string, so a hit
  ### starting at position 0 or 1 must not be extracted at all, otherwise bases from the far end of the chromosome are returned.

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1,$window_1);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### the extra bases are captured 3' and become 5' bases after reverse complementation; the window has to fit on the chromosome
    $non_bisulfite_sequence_2 = (length($chromosomes{$chr}) >= $start_2+$window_2)
      ? reverse_complement(substr ($chromosomes{$chr},$start_2,$window_2))
      : '';
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we capture 2 extra bases at the 5' end
    $non_bisulfite_sequence_1 = ($start_1 >= 2)
      ? substr ($chromosomes{$chr},$start_1-2,$window_1)
      : '';

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As we are doing a CT comparison for the reverse strand we take 2 bases extra at the 5' end, so they are 3' bases after
    ### reverse complementation
    $non_bisulfite_sequence_2 = ($start_2 >= 2)
      ? reverse_complement(substr ($chromosomes{$chr},$start_2-2,$window_2))
      : '';
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'CT';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### It is extracted at the position of the second Bowtie hit: as read 1 is GA converted we capture 2 extra 3' bases which
    ### will be 2 extra 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chr},$start_2,$window_2));

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### It is extracted at the position of the first Bowtie hit: read 2 is CT converted so we capture 2 extra 3' bases, and the
    ### window has to fit on the chromosome
    $non_bisulfite_sequence_2 = (length($chromosomes{$chr}) >= $start_1+$window_1)
      ? substr ($chromosomes{$chr},$start_1,$window_1)
      : '';
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'GA';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### It is extracted at the position of the second Bowtie hit: as read 1 is CT converted we capture 2 extra 5' bases which
    ### will be 2 extra 3' bases after reverse complementation
    $non_bisulfite_sequence_1 = ($start_2 >= 2)
      ? reverse_complement(substr ($chromosomes{$chr},$start_2-2,$window_2))
      : '';

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### It is extracted at the position of the first Bowtie hit: read 2 is GA converted so we capture 2 extra 5' bases
    $non_bisulfite_sequence_2 = ($start_1 >= 2)
      ? substr ($chromosomes{$chr},$start_1-2,$window_1)
      : '';
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2906
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2907 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
2432df265dad Uploaded
fcaramia
parents:
diff changeset
2908
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  ### Extracts the unconverted genomic sequence underlying both reads of a Bowtie 2 paired-end alignment.
  ###
  ### Arguments:
  ###   $sequence_identifier     - key of the read pair within $methylation_call_params
  ###   $methylation_call_params - hash reference; the entry for $sequence_identifier is read and updated in place
  ###
  ### Keys read from the entry:    CIGAR_1, CIGAR_2, index, chromosome, position_1, position_2
  ### Keys written on success:     alignment_read_1/2, genome_conversion, read_conversion_1/2,
  ###                              unmodified_genomic_sequence_1/2, end_position_1/2, indels_1/2
  ###
  ### For each read the genomic sequence is extended by 2 extra bases: upstream of the alignment start for
  ### index 1 or 3, downstream of the alignment end for index 0 or 2. This allows CpG, CHG or CHH context
  ### calls for a C at the very first (or last) position of the observed sequence.
  ###
  ### If the 2 extra bases would lie outside the chromosome the sub returns early. In that case only
  ### unmodified_genomic_sequence_N of the affected read is stored, holding whatever was collected so far
  ### ('' for a 5' failure, the sequence without the 2 extra bases for a 3' failure), and none of the
  ### other keys are written.
  ### NOTE(review): the caller presumably detects this via a length mismatch - confirm against the caller.
  ###
  ### Dies if a CIGAR string has a non-matching number of lengths and operations, contains operations other
  ### than M, I and D, or if index is not one of 0-3. Increments one of the alignment counters in %counting.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### '||= {}' keeps the entry attached to the caller's structure even if it did not exist before this call
  my $params     = ($methylation_call_params->{$sequence_identifier} ||= {});
  my $index      = $params->{index};
  my $chromosome = $params->{chromosome};
  my $cigar_1    = $params->{CIGAR_1};
  my $cigar_2    = $params->{CIGAR_2};

  ### Positions in SAM format are 1-based, so we need to subtract 1 when getting substrings
  my $pos_1 = $params->{position_1}-1;
  my $pos_2 = $params->{position_2}-1;

  ### Both CIGAR strings are parsed (and validated) before any sequence is extracted
  # parsing CIGAR 1 string
  my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
  my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
  shift @ops_1; # remove the empty first element
  die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
  # parsing CIGAR 2 string
  my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
  my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
  shift @ops_2; # remove the empty first element
  die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);

  ### Extracting read 1 genomic sequence ###
  ### $indels_N is later added to the Hamming distance value (needed for the NM field in the final SAM output)
  my ($non_bisulfite_sequence_1,$end_pos_1,$indels_1,$complete_1) = _extract_read_genomic_sequence_bowtie2($chromosome,$pos_1,\@len_1,\@ops_1,$cigar_1,1,$index);
  unless ($complete_1){ # we are right at the edge of a chromosome
    $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    return;
  }

  ### Extracting read 2 genomic sequence ###
  my ($non_bisulfite_sequence_2,$end_pos_2,$indels_2,$complete_2) = _extract_read_genomic_sequence_bowtie2($chromosome,$pos_2,\@len_2,\@ops_2,$cigar_2,2,$index);
  unless ($complete_2){ # we are right at the edge of a chromosome
    $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
    return;
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of
  ### whether read 1 or read 2 was the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';
    ### Read 1 is the forward hit; read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) bottom strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';
    ### Read 1 is the forward hit; read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) top strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $params->{alignment_read_1}  = $alignment_read_1;
  $params->{alignment_read_2}  = $alignment_read_2;
  $params->{genome_conversion} = $genome_conversion;
  $params->{read_conversion_1} = $read_conversion_info_1;
  $params->{read_conversion_2} = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## the (0-based) position directly after the last aligned genomic base of each read
  $params->{end_position_1} = $end_pos_1;
  $params->{end_position_2} = $end_pos_2;
  $params->{indels_1} = $indels_1;
  $params->{indels_2} = $indels_2;
}

### Private helper of extract_corresponding_genomic_sequence_paired_ends_bowtie2: collects the genomic
### sequence for ONE read by walking its already parsed CIGAR operations.
###
### Arguments: chromosome name (key of %chromosomes), 0-based start position, array ref of operation
### lengths, array ref of operations, the raw CIGAR string and the read number (both only used in error
### messages), and the alignment index (0-3) which decides at which end the 2 extra bases are added.
###
### Returns the list (sequence, end position, number of inserted/deleted bases, complete flag). The
### complete flag is 0 if the 2 extra bases could not be extracted because the alignment sits right at
### the edge of the chromosome; the sequence then holds only what was collected up to that point.
sub _extract_read_genomic_sequence_bowtie2{
  my ($chromosome,$pos,$lengths,$operations,$cigar,$read_number,$index) = @_;

  ### %chromosomes is always accessed in place; a whole chromosome sequence is far too large to copy per read
  my $sequence = '';
  my $indels   = 0;

  ### 2 additional bp at the 5' end
  if ( ($index == 1) or ($index == 3) ){
    ### offset 0 is a valid substring start, so the extraction is only impossible if the offset would be negative
    return ($sequence,$pos,$indels,0) unless ( ($pos-2) >= 0);
    $sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  foreach my $i (0..$#{$lengths}){
    my $length    = $lengths->[$i];
    my $operation = $operations->[$i];

    if ($operation eq 'M'){
      # extracting genomic sequence and adjusting the position
      $sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){ # insertion in the read sequence
      # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
      $sequence .= 'N' x $length;
      # the position doesn't need adjusting
      $indels += $length;
    }
    elsif ($operation eq 'D'){ # deletion in the read sequence
      # we do not add any genomic sequence but only adjust the position
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/NSHPX=//){ # tr/// only counts here; these operations are illegal for standard mapping
      die "The CIGAR $read_number string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR $read_number string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### 2 additional bp at the 3' end
  if ( ($index == 0) or ($index == 2) ){
    ### the chromosome has to extend at least 2 bp beyond the end of the alignment
    return ($sequence,$pos,$indels,0) unless (length($chromosomes{$chromosome}) >= $pos+2);
    $sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  return ($sequence,$pos,$indels,1);
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3132
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3133 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3134 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3135 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3136
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end{
  ### Writes one single-end (Bowtie 1) alignment together with its methylation call: a tab-delimited
  ### line is printed to OUT if $vanilla is set, otherwise the read is passed on to single_end_SAM_output().
  ### The start position stored in $methylation_call_params is incremented by 1 as a side effect.
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always written in Sanger encoding (Phred 33 scale)
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output, default since Bismark v1.0.0 (the sub itself is located at the end of the script)
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value) unless ($vanilla);

  ### vanilla format: every uniquely mapped read and its methylation call on one tab-separated line
  my $read = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$read}{qw(alignment_strand chromosome position end_position)},
    $sequence,
    @{$read}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
    $quality_value,
  );
  my $bowtie1_output = join("\t",@columns);
  print OUT "$bowtie1_output\n";
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3160
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3161 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3162 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3163 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3164
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Hands one mapped single-end (Bowtie 2) read and its methylation call over to single_end_SAM_output()
  ### after bringing the quality string into Sanger encoding (Phred 33 scale).
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### $phred64 takes precedence over $solexa; without either flag the qualities are passed on unchanged
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### unmapped and ambiguous reads were already printed, so only mapped reads get here
  ### (single_end_SAM_output is located at the end of the script)
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3179
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3180 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3181 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3182 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3183
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3184 sub print_bisulfite_mapping_results_paired_ends{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3185 my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3186
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3187 ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3188 if ($phred64){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3189 $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3190 $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3191 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3192 elsif ($solexa){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3193 $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3194 $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3195 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3196
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3197 ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3198 $methylation_call_params->{$identifier}->{start_seq_1} += 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3199
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3200 ### writing every single aligned read and its methylation call to the output file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3201 if ($vanilla){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3202 my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3203 print OUT "$bowtie1_output_paired_end\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3204 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3205 else{ # SAM output, default since Bismark v1.0.0
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3206 paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3207 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3208
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3209 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3210
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3211 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3212 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3213 ##########################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3214
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3215 sub print_bisulfite_mapping_results_paired_ends_bowtie2{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3216 my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3217
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3218 ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3219 if ($phred64){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3220 $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3221 $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3222 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3223 elsif ($solexa){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3224 $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3225 $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3226 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3227
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3228 ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3229 paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3230
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3231 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3232
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3233
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3234 sub convert_phred64_quals_to_phred33{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3235
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3236 my $qual = shift;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3237 my @quals = split (//,$qual);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3238 my @new_quals;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3239
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3240 foreach my $index (0..$#quals){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3241 my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3242 my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3243 $new_quals[$index] = $phred33_quality_string;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3244 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3245
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3246 my $phred33_quality = join ("",@new_quals);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3247 return $phred33_quality;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3248 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3249
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3250 sub convert_solexa_quals_to_phred33{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3251
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3252 my $qual = shift;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3253 my @quals = split (//,$qual);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3254 my @new_quals;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3255
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3256 foreach my $index (0..$#quals){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3257 my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3258 my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3259 $new_quals[$index] = $phred33_quality_string;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3260 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3261
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3262 my $phred33_quality = join ("",@new_quals);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3263 return $phred33_quality;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3264 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3265
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3266 sub convert_phred_score_into_phred33_quality_string{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3267 my $qual = shift;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3268 $qual = chr($qual+33);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3269 return $qual;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3270 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3271
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3272 sub convert_phred64_quality_string_into_phred_score{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3273 my $string = shift;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3274 my $qual = ord($string)-64;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3275 return $qual;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3276 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3277
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3278 sub convert_solexa_pre1_3_quality_string_into_phred_score{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3279 ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3280 my $string = shift;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3281 my $qual = ord($string)-59;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3282 return $qual;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3283 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3284
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3285
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3286 sub extract_corresponding_genomic_sequence_single_end {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3287 my ($sequence_identifier,$methylation_call_params) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3288 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3289 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3290
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3291 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3292 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3293 my $alignment_strand;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3294 my $read_conversion_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3295 my $genome_conversion;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3296 ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3297 ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3298 ### if the C happens to be at the last position of the actually observed sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3299 my $non_bisulfite_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3300 ### depending on the conversion we want to make, we need to capture 1 extra base at the 3' end
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3301
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3302 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3303 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3304 ### [Index 0, sequence originated from (converted) forward strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3305 $counting{CT_CT_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3306 $alignment_strand = '+';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3307 $read_conversion_info = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3308 $genome_conversion = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3309
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3310 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3311 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3312 ### + 2 extra base at the 3' end
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3313 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3314 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3315 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3316 $non_bisulfite_sequence = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3317 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3318 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3319
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3320 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3321 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3322 ### [Index 1, sequence originated from (converted) reverse strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3323 $counting{CT_GA_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3324 $alignment_strand = '-';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3325 $read_conversion_info = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3326 $genome_conversion = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3327
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3328 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3329 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3330 ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3331 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3332 ## reverse complement!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3333 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3334 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3335 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3336 $non_bisulfite_sequence = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3337 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3338 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3339
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3340 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3341 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3342 ### [Index 2, sequence originated from complementary to (converted) forward strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3343 $counting{GA_CT_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3344 $alignment_strand = '-';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3345 $read_conversion_info = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3346 $genome_conversion = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3347
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3348 ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3349 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3350 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3351 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3352 ## reverse complement!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3353 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3354 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3355 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3356 $non_bisulfite_sequence = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3357 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3358 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3359
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3360 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3361 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3362 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3363 $counting{GA_GA_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3364 $alignment_strand = '+';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3365 $read_conversion_info = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3366 $genome_conversion = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3367
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3368 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3369 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3370 ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3371 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3372 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3373 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3374 $non_bisulfite_sequence = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3375 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3376 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3377 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3378 die "Too many bowtie result filehandles\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3379 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3380
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3381 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3382 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3383 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3384 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3385
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3386 ### at this point we can also determine the end position of a read
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3387 $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3388 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3389
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3390
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3391 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3392 my ($sequence_identifier,$methylation_call_params) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3393
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3394 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3395 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3396
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3397 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3398 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3399
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3400 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3401 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3402 my $alignment_strand;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3403 my $read_conversion_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3404 my $genome_conversion;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3405 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3406 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3407 my $non_bisulfite_sequence = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3408
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3409 ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3410 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3411
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3412 # parsing CIGAR string
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3413 my @len = split (/\D+/,$cigar); # storing the length per operation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3414 my @ops = split (/\d+/,$cigar); # storing the operation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3415 shift @ops; # remove the empty first element
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3416 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3417
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3418 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3419 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3420 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3421 unless ( ($pos-2) >= 0){ # exiting with an empty genomic sequence otherwise
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3422 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3423 return;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3424 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3425 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3426 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3427 my $indels = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3428
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3429 foreach (0..$#len){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3430 if ($ops[$_] eq 'M'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3431 #extracting genomic sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3432 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3433 # adjusting position
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3434 $pos += $len[$_];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3435 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3436 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3437 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3438 $non_bisulfite_sequence .= 'N' x $len[$_];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3439 # warn "$non_bisulfite_sequence\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3440 # position doesn't need to be adjusted
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3441 $indels += $len[$_]; # adding this to $indels so we can determine the Hamming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions))
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3442 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3443 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3444 # we do not add any genomic sequence but only adjust the position
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3445 $pos += $len[$_];
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3446 $indels += $len[$_]; # adding this to $indels so we can determine the Hamming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions))
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3447 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3448 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3449 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3450 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3451 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3452 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3453 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3454 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3455
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3456 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3457 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3458 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3459 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with an empty genomic sequence otherwise
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3460 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3461 return;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3462 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3463 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3464 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3465 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3466
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3467
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3468
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3469 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3470 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3471 ### [Index 0, sequence originated from (converted) forward strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3472 $counting{CT_CT_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3473 $alignment_strand = '+';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3474 $read_conversion_info = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3475 $genome_conversion = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3476 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3477
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3478 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3479 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3480 ### [Index 1, sequence originated from (converted) reverse strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3481 $counting{CT_GA_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3482 $alignment_strand = '-';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3483 $read_conversion_info = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3484 $genome_conversion = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3485
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3486 ### reverse complement!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3487 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3488 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3489
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3490 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3491 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3492 ### [Index 2, sequence originated from complementary to (converted) forward strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3493 $counting{GA_CT_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3494 $alignment_strand = '-';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3495 $read_conversion_info = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3496 $genome_conversion = 'CT';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3497
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3498 ### reverse complement!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3499 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3500 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3501
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3502 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3503 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3504 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3505 $counting{GA_GA_count}++;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3506 $alignment_strand = '+';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3507 $read_conversion_info = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3508 $genome_conversion = 'GA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3509
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3510 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3511 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3512 die "Too many Bowtie 2 result filehandles\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3513 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3514
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3515 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3516 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3517 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3518 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3519
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3520 ### the end position of a read is stored in $pos
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3521 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3522 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3523 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3524
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3525 ### METHYLATION CALL
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3526
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub methylation_call{
  ### Compares the actually observed read sequence with the corresponding genomic sequence base by base and
  ### returns the methylation call string, which contains one character per base of the read.
  ###
  ### Arguments:
  ###   $identifier                 - read identifier (not used for the call itself, kept for the callers)
  ###   $sequence_actually_observed - the read sequence as it was observed
  ###   $genomic_sequence           - the genomic sequence, expected to be 2 bp longer than the read
  ###   $read_conversion            - 'CT' or 'GA'; any other value is fatal
  ###
  ### Side effect: the six total_* methylation counters in %counting are incremented.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
  ### CpG, CHH or CHG context

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################

  ### the genomic sequence needs to be 2 bp longer than the read so that the sequence context can be determined (numeric comparison)
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2));

  ### CT reads: read base i pairs with genomic base i, the context bases are found 1 and 2 positions downstream (a G marks CpG/CHG)
  ### GA reads: read base i pairs with genomic base i+2, we look at the G of the opposing strand's C, and the context bases are
  ###           found 1 and 2 positions upstream (a C marks CpG/CHG)
  my ($unconverted_base,$converted_base,$context_base,$offset,$direction);
  if ($read_conversion eq 'CT'){
    ($unconverted_base,$converted_base,$context_base,$offset,$direction) = ('C','T','G',0,1);
  }
  elsif ($read_conversion eq 'GA'){
    ($unconverted_base,$converted_base,$context_base,$offset,$direction) = ('G','A','C',2,-1);
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  my @match = ();
  my %calls = (Z => 0, z => 0, X => 0, x => 0, H => 0, h => 0);

  for my $index (0..$#seq) {
    my $genomic_index = $index + $offset;

    ### a genomic sequence that is too short has already been reported above, missing bases simply never match
    my $genomic_base = defined $genomic[$genomic_index] ? $genomic[$genomic_index] : '';

    ### only positions with a C (or a G for GA reads) in the genome are of interest for a methylation call:
    ### the same base in the read means it was protected by methylation, the converted base means it was not methylated
    my $methylated;
    if ($genomic_base eq $unconverted_base){
      if ($seq[$index] eq $unconverted_base){
        $methylated = 1;
      }
      elsif ($seq[$index] eq $converted_base){
        $methylated = 0;
      }
    }

    ### bases not involving cytosines and all other mismatches are not of interest for a methylation call
    unless (defined $methylated){
      push @match,'.';
      next;
    }

    ### determining the sequence context: CpG first, then CHG, everything else is CHH
    my $first_context_base  = $genomic[$genomic_index + $direction];
    my $second_context_base = $genomic[$genomic_index + 2*$direction];

    my $call;
    if (defined $first_context_base and $first_context_base eq $context_base){
      $call = 'Z';
    }
    elsif (defined $second_context_base and $second_context_base eq $context_base){
      $call = 'X';
    }
    else{
      $call = 'H';
    }

    ### lower case letters denote converted, i.e. not methylated, positions
    $call = lc $call unless ($methylated);

    ++$calls{$call};
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count} += $calls{H};
  $counting{total_meCHG_count} += $calls{X};
  $counting{total_meCpG_count} += $calls{Z};
  $counting{total_unmethylated_CHH_count} += $calls{h};
  $counting{total_unmethylated_CHG_count} += $calls{x};
  $counting{total_unmethylated_CpG_count} += $calls{z};

  return $methylation_call;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3711
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub read_genome_into_memory{
  ### Reads all FastA files (*.fa, or *.fasta if there are no *.fa files) found in $genome_folder and stores the
  ### upper-cased sequence of every entry in the %chromosomes hash, using the name extracted from the FastA header
  ### as key. Multi-FastA files are supported. Dies if the folder contains no FastA files, if a file does not start
  ### with a FastA header or if a chromosome name occurs more than once.
  ###
  ### Takes the working directory as argument; we change back into it once the genome has been read.
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open with a lexical filehandle, so a filename can never be mistaken for an open mode or a pipe
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header; an empty file is handed on as an empty header so that it is
    ### rejected by extract_chromosome_name
    my $first_line = <$chr_in>;
    $first_line = '' unless (defined $first_line);
    chomp $first_line;
    $first_line =~ tr/\r//d; # removing carriage returns of files with DOS line endings

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting out with an empty (but defined) sequence so that entries without any sequence are stored as ''
    my $sequence = '';
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ tr/\r//d; # carriage returns must not end up in the genomic sequence

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    close ($chr_in) or warn "Failed to close filehandle for sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3781
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub extract_chromosome_name {
  ### Returns the chromosome name of a FastA header line, i.e. everything between the initial > and the first
  ### whitespace. Bowtie seems to name its sequences in the same way, so we are doing this as well.
  ### Dies if the line passed in does not start with a >.
  my $fasta_header = shift;

  ### removing the leading >, anything else is not a FastA header
  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ### only the first whitespace separated field is of interest, the rest of the header stays in one piece
  my @header_fields = split (/\s+/,$fasta_header,2);
  return $header_fields[0];
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3793
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub reverse_complement{
  ### Returns the reverse complement of a sequence. Only the upper case bases A, C, G and T are complemented,
  ### all other characters (such as N) are kept as they are.
  my $sequence = shift;

  ### reversing first and complementing afterwards, the order of the two steps does not matter
  my $revcomp = scalar reverse ($sequence);
  $revcomp =~ tr/CATG/GTAC/;

  return $revcomp;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3800
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub biTransformFastAFiles {
  ## Writes bisulfite-converted copies of a single-end FastA read file into $temp_dir:
  ## always a C -> T version and, unless $directional is set, a G -> A version as well.
  ##
  ## Argument: path of the FastA file; it is read through zcat if the name ends in .gz.
  ##           The input is consumed two lines at a time (header line, sequence line).
  ## Returns:  ($C_to_T_infile, $G_to_A_infile) - the bare output file names, without
  ##           $temp_dir. The G -> A name is returned in directional mode as well,
  ##           although no such file is written then.
  ## Dies if a file cannot be opened, if an output file cannot be closed cleanly, or if
  ## a processed header line does not start with '>'.
  ##
  ## Relies on $skip, $upto, $directional, $temp_dir, $seqID_contains_tabs and
  ## fix_IDs(), all of which are defined outside of this subroutine.
  my $file = shift;

  # the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    (undef,$filename) = $file =~ m/(.*\/)(.*)$/;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list-form pipe open: the file name is handed to zcat directly and never passes
    # through a shell, so spaces or shell metacharacters in the path are harmless
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # explicit read mode so that the file name can never be interpreted as a mode or pipe
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1); # presumably gives the user a moment to notice the warning
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  unless ($directional){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    my $header = <IN>;
    my $sequence= <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # records numbered up to $skip are counted but not written
    if ($skip){
      next unless ($count > $skip);
    }
    # stop as soon as the record number exceeds $upto
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print CTOT "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA "$header$sequence_G_to_A";
    }
  }

  # The input may be a zcat pipe that was abandoned early (--upto), in which case zcat
  # exits abnormally; the result of this close is therefore deliberately not checked.
  close (IN);

  # Buffered write errors (e.g. a full disk) only surface when the handle is closed
  close (CTOT) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  unless ($directional){
    close (GTOA) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
3884
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub biTransformFastAFiles_paired_end {
  ## Writes bisulfite-converted copies of one file of a paired-end FastA data set
  ## into $temp_dir.
  ##
  ## Arguments: ($file, $read_number)
  ##   $file        - path of the FastA file; read through zcat if the name ends in .gz.
  ##                  The input is consumed two lines at a time (header, sequence).
  ##   $read_number - 1 or 2; any other value is fatal.
  ## In directional mode ($directional set) read 1 is written C -> T converted only and
  ## read 2 G -> A converted only; otherwise both conversions are written for the file.
  ## Every header written gets "/1" or "/2" inserted at its end ("/1/1" or "/2/2" if
  ## $bowtie2 is set).
  ## Returns: in directional mode the single output file name that was written,
  ##          otherwise ($C_to_T_infile, $G_to_A_infile); names are without $temp_dir.
  ## Dies if a file cannot be opened, if an output file cannot be closed cleanly, if a
  ## processed header line does not start with '>', or on an invalid read number.
  ##
  ## Relies on $skip, $upto, $directional, $bowtie2, $temp_dir, $seqID_contains_tabs
  ## and fix_IDs(), all of which are defined outside of this subroutine.
  my ($file,$read_number) = @_;

  # the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    (undef,$filename) = $file =~ m/(.*\/)(.*)$/;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list-form pipe open: the file name is handed to zcat directly and never passes
    # through a shell, so spaces or shell metacharacters in the path are harmless
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # explicit read mode so that the file name can never be interpreted as a mode or pipe
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1); # presumably gives the user a moment to notice the warning
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;

  # remember which output handles this call opened so that exactly those are
  # written to and closed again further down
  my ($write_C_to_T,$write_G_to_A) = (0,0);

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
      $write_C_to_T = 1;
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
      $write_G_to_A = 1;
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    $write_C_to_T = 1;
    $write_G_to_A = 1;
  }

  my $count = 0;

  while (1){
    my $header = <IN>;
    my $sequence= <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # records numbered up to $skip are counted but not written
    if ($skip){
      next unless ($count > $skip);
    }
    # stop as soon as the record number exceeds $upto
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    # tag the read ID with its read number; s/$/.../ inserts in front of a trailing newline
    if ($read_number == 1){
      if ($bowtie2){
        $header =~ s/$/\/1\/1/;
      }
      else{
        $header =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $header =~ s/$/\/2\/2/;
      }
      else{
        $header =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }

    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    # directional: read 1 -> C to T only, read 2 -> G to A only; otherwise both
    print CTOT "$header$sequence_C_to_T" if ($write_C_to_T);
    print GTOA "$header$sequence_G_to_A" if ($write_G_to_A);
  }

  # The input may be a zcat pipe that was abandoned early (--upto), in which case zcat
  # exits abnormally; the result of this close is therefore deliberately not checked.
  close (IN);

  # Buffered write errors (e.g. a full disk) only surface when the handle is closed
  if ($write_C_to_T){
    close (CTOT) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  }
  if ($write_G_to_A){
    close (GTOA) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4028
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4029
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub biTransformFastQFiles {
  ## Writes bisulfite-converted copies of a single-end FastQ read file into $temp_dir:
  ## always a C -> T version and, unless $directional is set, a G -> A version as well.
  ##
  ## Argument: path of the FastQ file; it is read through zcat if the name ends in .gz.
  ##           The input is consumed four lines at a time (identifier, sequence,
  ##           second identifier, quality string).
  ## Returns:  ($C_to_T_infile, $G_to_A_infile) - the bare output file names, without
  ##           $temp_dir. In directional mode no G -> A file is written and the second
  ##           element is just the unmodified input file name.
  ## Dies if a file cannot be opened, if an output file cannot be closed cleanly, or if
  ## a processed record does not have '@' / '+' at the start of its identifier lines.
  ##
  ## Relies on $skip, $upto, $directional, $temp_dir, $seqID_contains_tabs and
  ## fix_IDs(), all of which are defined outside of this subroutine.
  my $file = shift;

  # the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    (undef,$filename) = $file =~ m/(.*\/)(.*)$/;
  }

  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    # list-form pipe open: the file name is handed to zcat directly and never passes
    # through a shell, so spaces or shell metacharacters in the path are harmless
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    # explicit read mode so that the file name can never be interpreted as a mode or pipe
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1); # presumably gives the user a moment to notice the warning
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  $C_to_T_infile =~ s/$/_C_to_T.fastq/;
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  unless ($directional){
    $G_to_A_infile =~ s/$/_G_to_A.fastq/;
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # records numbered up to $skip are counted but not written
    if ($skip){
      next unless ($count > $skip);
    }
    # stop as soon as the record number exceeds $upto
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  # The input may be a zcat pipe that was abandoned early (--upto), in which case zcat
  # exits abnormally; the result of this close is therefore deliberately not checked.
  close (IN);

  # Buffered write errors (e.g. a full disk) only surface when the handle is closed
  close (CTOT) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  unless ($directional){
    close (GTOA) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4120
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4121 sub biTransformFastQFiles_paired_end {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4122 my ($file,$read_number) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4123 my ($dir,$filename);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4124
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4125 if ($file =~ /\//){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4126 ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4127 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4128 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4129 $filename = $file;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4130 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4131
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4132 ### gzipped version of the infile
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4133 if ($file =~ /\.gz$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4134 open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4135 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4136 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4137 open (IN,$file) or die "Couldn't read from file $file: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4138 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4139
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4140 if ($skip){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4141 warn "Skipping the first $skip reads from $file\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4142 sleep (1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4143 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4144 if ($upto){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4145 warn "Processing reads up to sequence no. $upto from $file\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4146 sleep (1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4147 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4148
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4149 my $C_to_T_infile = my $G_to_A_infile = $filename;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4150 $C_to_T_infile =~ s/$/_C_to_T.fastq/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4151 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4152
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4153 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4154 if ($read_number == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4155 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4156 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4157 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4158 elsif ($read_number == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4159 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4160 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4161 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4162 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4163 die "Read number needs to be 1 or 2, but was $read_number!\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4164 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4165 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4166 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4167 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4168 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4169 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4170 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4171 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4172
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4173 my $count = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4174
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4175 while (1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4176 my $identifier = <IN>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4177 my $sequence = <IN>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4178 my $identifier2 = <IN>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4179 my $quality_score = <IN>;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4180 last unless ($identifier and $sequence and $identifier2 and $quality_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4181 ++$count;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4182
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4183 $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4184
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4185 if ($skip){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4186 next unless ($count > $skip);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4187 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4188 if ($upto){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4189 last if ($count > $upto);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4190 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4191
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4192 $sequence= uc$sequence; # make input file case insensitive
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4193
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4194 ## small check if the sequence file appears to be a FastQ file
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4195 if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4196 die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4197 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4198 my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4199
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4200 if ($read_number == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4201 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4202 $identifier =~ s/$/\/1\/1/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4203 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4204 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4205 $identifier =~ s/$/\/1/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4206 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4207 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4208 elsif ($read_number == 2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4209 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4210 $identifier =~ s/$/\/2\/2/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4211 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4212 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4213 $identifier =~ s/$/\/2/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4214 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4215 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4216 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4217 die "Read number needs to be 1 or 2\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4218 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4219
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4220 $sequence_C_to_T =~ tr/C/T/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4221 $sequence_G_to_A =~ tr/G/A/;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4222
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4223 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4224 if ($read_number == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4225 print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4226 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4227 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4228 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4229 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4230 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4231 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4232 print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4233 print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4234 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4235 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4236
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4237 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4238 if ($read_number == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4239 print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4240 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4241 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4242 print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4243 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4244 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4245 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4246 print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4247 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4248 if ($directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4249 if ($read_number == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4250 return ($C_to_T_infile);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4251 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4252 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4253 return ($G_to_A_infile);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4254 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4255 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4256 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4257 return ($C_to_T_infile,$G_to_A_infile);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4258 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4259 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4260
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub fix_IDs{
  ## Takes a read identifier line and returns a copy in which every run of
  ## blanks and/or tabs has been collapsed into a single underscore. Other
  ## characters (including a trailing newline) are passed through untouched.
  ## The caller's variable is not modified.
  my ($read_id) = @_;

  # one underscore per run of whitespace, not one per character
  $read_id =~ s/[ \t]+/_/g;

  return $read_id;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4266
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub ensure_sensical_alignment_orientation_single_end{
  ## Decides whether a single-end alignment has a strand that makes sense for
  ## the Bowtie instance which reported it, and updates that instance's counters.
  ##
  ## Arguments:
  ##   $index  - position in @fhs of the instance that produced the alignment
  ##   $strand - strand of the alignment, expected to be '+' or '-'
  ##
  ## Returns 1 if the strand is the sensible one for this instance ({seen} is
  ## incremented), and 0 otherwise. For the opposite strand {wrong_strand} is
  ## incremented. Dies if the instance name is not one of the four known ones.
  my $index = shift;
  my $strand = shift;

  my $name = $fhs[$index]->{name};

  ### The only strand we accept for each read/genome conversion combination:
  ###   CTreadCTgenome: C->T read against C->T genome  => (+)
  ###   CTreadGAgenome: C->T read against G->A genome  => (-)
  ###   GAreadCTgenome: G->A read against C->T genome  => (-)
  ###   GAreadGAgenome: G->A read against G->A genome  => (+)
  my $expected_strand;
  if ($name eq 'CTreadCTgenome') {
    $expected_strand = '+';
  }
  elsif ($name eq 'CTreadGAgenome') {
    $expected_strand = '-';
  }
  elsif ($name eq 'GAreadCTgenome') {
    $expected_strand = '-';
  }
  elsif ($name eq 'GAreadGAgenome') {
    $expected_strand = '+';
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### alignment is in the expected orientation: count it and report success
  if ($strand eq $expected_strand) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### alignment is on the opposite strand: nonsensical, count it as such
  if ($strand eq '+' or $strand eq '-') {
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }

  ### Anything other than '+' or '-' used to fall off the end of the sub and
  ### hand back an implicit value. It is now explicitly reported as "not a
  ### sensible orientation"; as before, neither counter is touched.
  return 0;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4339
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub ensure_sensical_alignment_orientation_paired_ends{
  ## Decides whether a paired-end alignment is laid out in a way that makes
  ## sense for the Bowtie instance which reported it, and updates that
  ## instance's counters.
  ##
  ## Arguments:
  ##   $index              - position in @fhs of the reporting instance
  ##   $id_1, $strand_1    - ID and strand of the first reported alignment line
  ##   $id_2, $strand_2    - ID and strand of the second reported alignment line
  ##
  ## Only two layouts are accepted at all; both have the first line on (+) and
  ## the second on (-), and differ in which read (ID ending in 1 or 2) is first.
  ## Returns 1 ({seen} incremented) for the layout the instance expects and
  ## 0 ({wrong_strand} incremented) for the other one. Dies on any other
  ## layout and on an unknown instance name.
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $name = $fhs[$index]->{name};

  ### Which read has to be reported first for this instance?
  ###   CTread1GAread2CTgenome / GAread1CTread2GAgenome: read 1 first
  ###   GAread1CTread2CTgenome / CTread1GAread2GAgenome: read 2 first
  my $read_1_comes_first;
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome') {
    $read_1_comes_first = 1;
  }
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome') {
    $read_1_comes_first = 0;
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### the two layouts are mutually exclusive, $id_1 cannot end in both 1 and 2
  my $pair_is_1_then_2 = ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-');
  my $pair_is_2_then_1 = ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-');

  unless ($pair_is_1_then_2 or $pair_is_2_then_1) {
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  my $sensical = $read_1_comes_first ? $pair_is_1_then_2 : $pair_is_2_then_1;

  if ($sensical) {
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### it was one of the two valid layouts, just not the one this instance wants
  $fhs[$index]->{wrong_strand}++;
  return 0;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4436
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4437 #####################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4438
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4439 ### Bowtie 1 (default) | PAIRED-END | FASTA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4440
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  ## Starts one Bowtie paired-end process per usable entry of @fhs and primes
  ## each entry with the first alignment pair that process reports.
  ##
  ## Arguments: the names of the four converted FastA files (C->T and G->A
  ## versions of read 1 and of read 2). They are only used for the progress
  ## message; the files actually aligned are taken from each @fhs entry.
  ##
  ## For every processed entry this sets {last_seq_id}, {last_line_1} and
  ## {last_line_2}, which are all undef if there was nothing to read.
  ## Dies if the pipe cannot be opened or if neither of the first two
  ## alignment IDs carries the read 1 tag (/1).
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode, entries without an input file are not run at all
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    ### restrict the reported orientation so that alignments only come back in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment occupies two consecutive output lines
    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### nothing (or only half a pair) came back: initialise everything as undefined
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    chomp $line_1;
    chomp $line_2;

    ### the sequence identifier is the first tab-separated column of each line
    my $id_1 = (split(/\t/,$line_1))[0];
    my $id_2 = (split(/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can
    ### be either read. Whichever ID carries the read 1 tag is stored, minus the tag, as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    ### either of these may hold read 1 or read 2
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4519
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4520 ### Bowtie 2 | PAIRED-END | FASTA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4521
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    ### Starts a Bowtie 2 paired-end alignment (FastA input) for every entry of @fhs and
    ### primes each entry with the first pair of alignment lines read from the aligner.
    ###
    ### Arguments: the names of the C->T and G->A converted files for read 1 and read 2.
    ###            Within this sub they are only used for the progress message; the files
    ###            actually aligned are taken from $fh->{inputfile_1} and $fh->{inputfile_2}.
    ###
    ### For each processed entry of @fhs this sub
    ###   - opens a read pipe to the aligner and stores the handle in $fh->{fh}
    ###   - skips the SAM header lines (lines starting with @)
    ###   - stores the first two alignment lines in $fh->{last_line_1} and $fh->{last_line_2}
    ###   - stores the ID of read 1 (with its trailing /1 removed) in $fh->{last_seq_id}
    ### If no two lines could be read, all three fields are set to undef.
    ### Dies if the pipe to the aligner cannot be opened.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
    if ($directional){
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    }
    else{
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    }

    ## Now starting up to 4 instances of Bowtie 2, feeding in the converted sequence files, reading in the
    ## first alignment lines of the output, and storing them in the @fhs data structure
    if ($directional){
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without a first input file are not aligned at all
        if ($directional){
            unless ($fh->{inputfile_1}){
                $fh->{last_seq_id} = undef;
                $fh->{last_line_1} = undef;
                $fh->{last_line_2} = undef;
                next;
            }
        }

        my $bt2_options = $bowtie_options;
        if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
            $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt2_options .= ' --nofw';
        }

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
        ### A lexical variable is used here (instead of $_) so that the caller's $_ is not overwritten.
        my $line_1;
        while (1){
            $line_1 = $fh->{fh}->getline();
            last unless ($line_1);            # no alignment output
            last unless ($line_1 =~ /^\@/);   # SAM headers start with @
        }

        my $line_2 = $fh->{fh}->getline();

        # if Bowtie 2 produces an alignment we store the first two lines of the output
        if ($line_1 and $line_2) {
            chomp $line_1;
            chomp $line_2;
            my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
            my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

            ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
            ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

            if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
                $fh->{last_seq_id} = $id_1;
            }
            elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /1 tag if present
                $fh->{last_seq_id} = $id_2;
            }
            else{
                # NOTE(review): the other paired-end subs in this file die at this point; here we only warn
                # and leave last_seq_id untouched. Confirm whether this is intentional.
                warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
            }

            $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
            $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
            warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
        }
        # otherwise we just initialise last_seq_id and last_lines as undefined
        else {
            print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
        }
    }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4609
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4610 ### Bowtie 1 (default) | PAIRED-END | FASTQ
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4611
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    ### Starts a Bowtie (1) paired-end alignment (FastQ input) for every entry of @fhs and
    ### records the first pair of alignment lines of each run.
    ###
    ### The four arguments (converted read 1 / read 2 file names) are only used for the
    ### progress message; the aligned files come from $fh->{inputfile_1} and $fh->{inputfile_2}.
    ###
    ### Per entry this fills in $fh->{fh} (read pipe to the aligner), $fh->{last_line_1},
    ### $fh->{last_line_2} and $fh->{last_seq_id} (ID of read 1 without its /1 tag).
    ### Dies if the pipe cannot be opened or if neither of the first two IDs ends in /1.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    ## Report the input files, then announce how many Bowtie instances are going to be started
    if ($directional){
        print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    ALIGNER:
    foreach my $fh (@fhs) {

        # directional mode: entries without a first input file are reset and skipped
        if ($directional and not $fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next ALIGNER;
        }

        ### ensuring the alignments are only reported in a sensible manner
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options = $bowtie_options;
        $bt_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        # both lines are always requested, even if the first read already came back empty
        my $first_hit  = $fh->{fh}->getline();
        my $second_hit = $fh->{fh}->getline();

        # without two lines of output there is no alignment: initialise everything as undefined
        unless ($first_hit and $second_hit) {
            print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
            next ALIGNER;
        }

        chomp $first_hit;
        chomp $second_hit;

        ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
        ### either sequence 1 or sequence 2. The ID carrying the /1 tag is the one stored as last_seq_id.
        my $first_id  = (split(/\t/,$first_hit))[0];  # sequence identifier of the first output line
        my $second_id = (split(/\t/,$second_hit))[0]; # sequence identifier of the second output line

        if ($first_id =~ s/\/1$//){      # read 1 tag removed from the first ID
            $fh->{last_seq_id} = $first_id;
        }
        elsif ($second_id =~ s/\/1$//){  # read 1 tag removed from the second ID
            $fh->{last_seq_id} = $second_id;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $first_id; ID2 was: $second_id\n";
        }

        $fh->{last_line_1} = $first_hit;  # this contains read 1 or read 2
        $fh->{last_line_2} = $second_hit; # this contains read 1 or read 2
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4689
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4690 ### Bowtie 2 | PAIRED-END | FASTQ
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4691
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    ### Starts a Bowtie 2 paired-end alignment (FastQ input) for every entry of @fhs and
    ### primes each entry with the first pair of alignment lines read from the aligner.
    ###
    ### Arguments: the names of the C->T and G->A converted files for read 1 and read 2.
    ###            Within this sub they are only used for the progress message; the files
    ###            actually aligned are taken from $fh->{inputfile_1} and $fh->{inputfile_2}.
    ###
    ### For each processed entry of @fhs this sub
    ###   - opens a read pipe to the aligner and stores the handle in $fh->{fh}
    ###   - skips the SAM header lines (lines starting with @)
    ###   - stores the first two alignment lines in $fh->{last_line_1} and $fh->{last_line_2}
    ###   - stores the ID of read 1 (with its trailing /1 removed) in $fh->{last_seq_id}
    ### If no two lines could be read, all three fields are set to undef.
    ### Dies if the pipe cannot be opened, or if neither of the first two IDs ends in /1.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
    if ($directional){
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
    }
    else{
        print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
    }

    ## Now starting up to 4 instances of Bowtie 2, feeding in the converted sequence files, reading in the
    ## first alignment lines of the output, and storing them in the @fhs data structure
    if ($directional){
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without a first input file are not aligned at all
        if ($directional){
            unless ($fh->{inputfile_1}){
                $fh->{last_seq_id} = undef;
                $fh->{last_line_1} = undef;
                $fh->{last_line_2} = undef;
                next;
            }
        }

        my $bt2_options = $bowtie_options;
        if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
            $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt2_options .= ' --nofw';
        }

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
        ### A lexical variable is used here (instead of $_) so that the caller's $_ is not overwritten.
        my $line_1;
        while (1){
            $line_1 = $fh->{fh}->getline();
            last unless ($line_1);            # no alignment output
            last unless ($line_1 =~ /^\@/);   # SAM headers start with @
        }

        my $line_2 = $fh->{fh}->getline();

        # if Bowtie 2 produces an alignment we store the first two lines of the output
        if ($line_1 and $line_2) {
            chomp $line_1;
            chomp $line_2;
            ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
            ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

            my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
            my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

            if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
                $fh->{last_seq_id} = $id_1;
            }
            elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
                $fh->{last_seq_id} = $id_2;
            }
            else{
                die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
            }

            $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
            $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
            warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
        }

        # otherwise we just initialise last_seq_id and last_lines as undefined
        else {
            print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
        }
    }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4780
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4781 #####################################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4782
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4783 ### Bowtie 1 (default) | SINGLE-END | FASTA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    ### Starts a Bowtie (1) single-end alignment (FastA input) for every entry of @fhs and
    ### primes each entry with the first alignment line read from the aligner.
    ###
    ### Arguments: the names of the C->T and G->A converted sequence files. Within this sub
    ###            they are only used for the progress message; the file actually aligned
    ###            is taken from $fh->{inputfile}.
    ###
    ### For each entry of @fhs this sub
    ###   - opens a read pipe to the aligner and stores the handle in $fh->{fh}
    ###   - stores the first output line (chomped) in $fh->{last_line}
    ###   - stores its first tab-separated field (the sequence ID) in $fh->{last_seq_id}
    ### If no line could be read, both fields are set to undef.
    ### Dies if the pipe to the aligner cannot be opened.
    my ($C_to_T_infile,$G_to_A_infile) = @_;
    if ($directional){
        print "Input file is $C_to_T_infile (FastA)\n";
    }
    else{
        print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    }

    ## Now starting up to 4 instances of Bowtie, feeding in the converted sequence files, reading in the
    ## first line of the bowtie output, and storing it in the @fhs data structure
    if ($directional){
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        my $bt_options = $bowtie_options;
        if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
            $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt_options .= ' --nofw';
        }

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
        open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

        # if Bowtie produces an alignment we store the first line of the output.
        # A lexical variable is used here (instead of $_) so that the caller's $_ is not overwritten.
        my $first_line = $fh->{fh}->getline();
        if ($first_line) {
            chomp $first_line;
            my $id = (split(/\t/,$first_line))[0]; # this is the first element of the bowtie output (= the sequence identifier)
            $fh->{last_seq_id} = $id;
            $fh->{last_line} = $first_line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined
        else {
            print "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line} = undef;
        }
    }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4832
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Bowtie 2 | SINGLE-END | FASTA
# Starts one Bowtie 2 process (read through a pipe) for every entry in @fhs and
# primes each entry with the first non-header line of that process' output.
#
# Arguments:
#   $C_to_T_infile, $G_to_A_infile - names of the converted input files; they are
#                                    only used for the progress message here.
#
# File-level variables read: $directional, $genome_folder, $bowtie_options,
# $path_to_bowtie, $temp_dir and @fhs.  Each @fhs entry must already carry
# 'name', 'bisulfiteIndex' and 'inputfile'.
#
# Effects on every @fhs entry:
#   fh          - read handle on the Bowtie 2 pipe
#   last_line   - first non-header output line (chomped), or undef if there was none
#   last_seq_id - first tab-separated field of that line, or undef
# Note that $_ is used as the line buffer and is therefore overwritten.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  # Tell the user which converted file(s) are being processed
  my $input_note = $directional
    ? "Input file is $C_to_T_infile (FastA)\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  print $input_note;

  # Announce how many aligner instances are about to be launched
  my $launch_note = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $launch_note;

  foreach my $fh (@fhs) {

    # Restrict the reported strand depending on the read/genome conversion pair
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # Bowtie 2 writes SAM, so keep reading while we are still looking at
    # header lines (they start with '@'); stop at the first record or at EOF
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    # No output at all: initialise both fields as undefined
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    # Bowtie 2 emits a result line even for reads without an alignment, so the
    # first record is stored unconditionally; its first column is the read ID
    chomp;
    my ($id) = split(/\t/);
    $fh->{last_seq_id} = $id;
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4892
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4893
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Bowtie 1 (default) | SINGLE-END | FASTQ
# Starts one Bowtie process (read through a pipe) for every entry in @fhs and
# primes each entry with the first line of that process' output.
#
# Arguments:
#   $C_to_T_infile, $G_to_A_infile - names of the converted input files; they are
#                                    only used for the progress message here.
#
# File-level variables read: $directional, $genome_folder, $bowtie_options,
# $path_to_bowtie, $temp_dir and @fhs.  Each @fhs entry must already carry
# 'name', 'bisulfiteIndex' and 'inputfile'.
#
# Effects on every @fhs entry:
#   fh          - read handle on the Bowtie pipe
#   last_line   - first output line (chomped), or undef if there was none
#   last_seq_id - first tab-separated field of that line, or undef
# Note that $_ is used as the line buffer and is therefore overwritten.
#
# Dies if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files, reading in the first line of
  ## the Bowtie output and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    # Per-instance copy of the options with the strand restriction appended
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

    # The command has to use the per-instance $bt_options (not the shared
    # $bowtie_options), otherwise the --norc/--nofw restriction announced in
    # the message above never reaches Bowtie
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output
    $_ = $fh->{fh}->getline();
    if ($_) {
      chomp;
      my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $_;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
4942
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Bowtie 2 | SINGLE-END | FASTQ
# Starts one Bowtie 2 process (read through a pipe) for every entry in @fhs and
# primes each entry with the first non-header line of that process' output.
#
# Arguments:
#   $C_to_T_infile, $G_to_A_infile - names of the converted input files; they are
#                                    only used for the progress message here.
#
# File-level variables read: $directional, $genome_folder, $bowtie_options,
# $path_to_bowtie, $temp_dir and @fhs.  Each @fhs entry must already carry
# 'name', 'bisulfiteIndex' and 'inputfile'.
#
# Effects on every @fhs entry:
#   fh          - read handle on the Bowtie 2 pipe
#   last_line   - first non-header output line (chomped), or undef if there was none
#   last_seq_id - first tab-separated field of that line, or undef
# Note that $_ is used as the line buffer and is therefore overwritten.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  # Tell the user which converted file(s) are being processed
  my $input_note = $directional
    ? "Input file is $C_to_T_infile (FastQ)\n\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  print $input_note;

  # Announce how many aligner instances are about to be launched
  my $launch_note = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $launch_note;

  foreach my $fh (@fhs) {

    # Restrict the reported strand depending on the read/genome conversion pair
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # Bowtie 2 writes SAM, so keep reading while we are still looking at
    # header lines (they start with '@'); stop at the first record or at EOF
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    # No output at all: initialise both fields as undefined
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    # Bowtie 2 emits a result line even for reads without an alignment, so the
    # first record is stored unconditionally; its first column is the read ID
    chomp;
    my ($id) = split(/\t/);
    $fh->{last_seq_id} = $id;
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5001
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5002 ###########################################################################################################################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5003
2432df265dad Uploaded
fcaramia
parents:
diff changeset
# Resets the per-file bookkeeping before a new input file is processed:
# every entry of %counting goes back to 0 and @fhs is rebuilt from scratch.
#
# Argument:
#   $filename - name of the file about to be processed; a comma in it marks
#               paired-end input (two file names joined by ',').
#
# File-level variables read:    $directional, $CT_index_basename, $GA_index_basename
# File-level variables written: %counting, @fhs
#
# @fhs receives all four read/genome combinations, except for directional
# single-end input, where only the two 'CTread' combinations are set up.
sub reset_counters_and_fhs{
  my $filename = shift;

  my @counter_names = (
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    'CT_CT_count',               # (CT read/CT genome, original top strand)
    'CT_GA_count',               # (CT read/GA genome, original bottom strand)
    'GA_CT_count',               # (GA read/CT genome, complementary to original top strand)
    'GA_GA_count',               # (GA read/GA genome, complementary to original bottom strand)
    'CT_GA_CT_count',            # (CT read1/GA read2/CT genome, original top strand)
    'GA_CT_GA_count',            # (GA read1/CT read2/GA genome, complementary to original bottom strand)
    'GA_CT_CT_count',            # (GA read1/CT read2/CT genome, complementary to original top strand)
    'CT_GA_GA_count',            # (CT read1/GA read2/GA genome, original bottom strand)
    'alignments_rejected_count', # only relevant if --directional was specified
  );
  %counting = map { $_ => 0 } @counter_names;

  # One row per read/genome combination: name, strand identity, index to align against
  my @strand_specs = (
    [ 'CTreadCTgenome', 'con ori forward',       $CT_index_basename ],
    [ 'CTreadGAgenome', 'con ori reverse',       $GA_index_basename ],
    [ 'GAreadCTgenome', 'compl ori con forward', $CT_index_basename ],
    [ 'GAreadGAgenome', 'compl ori con reverse', $GA_index_basename ],
  );

  # Directional single-end runs only need the first two combinations;
  # everything else (paired-end or non-directional) uses all four
  my $wanted = 4;
  if ($directional and $filename !~ /,/){
    $wanted = 2;
  }

  my @fresh_fhs;
  foreach my $spec (@strand_specs[0 .. $wanted - 1]) {
    my ($name, $strand_identity, $index_basename) = @{$spec};
    push @fresh_fhs, {
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  }
  @fhs = @fresh_fhs;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5105
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5106
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5107 sub process_command_line{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5108 my @bowtie_options;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5109 my $help;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5110 my $mates1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5111 my $mates2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5112 my $path_to_bowtie;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5113 my $fastq;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5114 my $fasta;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5115 my $skip;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5116 my $qupto;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5117 my $phred64;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5118 my $phred33;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5119 my $solexa;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5120 my $mismatches;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5121 my $seed_length;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5122 my $best;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5123 my $sequence_format;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5124 my $version;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5125 my $quiet;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5126 my $chunk;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5127 my $non_directional;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5128 my $ceiling;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5129 my $maxins;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5130 my $minins;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5131 my $unmapped;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5132 my $multi_map;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5133 my $output_dir;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5134 my $bowtie2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5135 my $vanilla;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5136 my $sam_no_hd;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5137 my $seed_extension_fails;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5138 my $reseed_repetitive_seeds;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5139 my $most_valid_alignments;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5140 my $score_min;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5141 my $parallel;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5142 my $temp_dir;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5143
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5144 my $command_line = GetOptions ('help|man' => \$help,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5145 '1=s' => \$mates1,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5146 '2=s' => \$mates2,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5147 'path_to_bowtie=s' => \$path_to_bowtie,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5148 'f|fasta' => \$fasta,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5149 'q|fastq' => \$fastq,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5150 's|skip=i' => \$skip,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5151 'u|upto=i' => \$qupto,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5152 'phred33-quals' => \$phred33,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5153 'phred64-quals|solexa1' => \$phred64,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5154 'solexa-quals' => \$solexa,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5155 'n|seedmms=i' => \$mismatches,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5156 'l|seedlen=i' => \$seed_length,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5157 'no_best' => \$best,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5158 'version' => \$version,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5159 'quiet' => \$quiet,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5160 'chunkmbs=i' => \$chunk,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5161 'non_directional' => \$non_directional,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5162 'I|minins=i' => \$minins,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5163 'X|maxins=i' => \$maxins,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5164 'e|maqerr=i' => \$ceiling,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5165 'un|unmapped' => \$unmapped,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5166 'ambiguous' => \$multi_map,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5167 'o|output_dir=s' => \$output_dir,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5168 'bowtie2' => \$bowtie2,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5169 'vanilla' => \$vanilla,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5170 'sam-no-hd' => \$sam_no_hd,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5171 'D=i' => \$seed_extension_fails,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5172 'R=i' => \$reseed_repetitive_seeds,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5173 'score_min=s' => \$score_min,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5174 'most_valid_alignments=i' => \$most_valid_alignments,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5175 'p=i' => \$parallel,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5176 'temp_dir=s' => \$temp_dir,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5177 );
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5178
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5179
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5180 ### EXIT ON ERROR if there were errors with any of the supplied options
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5181 unless ($command_line){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5182 die "Please respecify command line options\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5183 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5184 ### HELPFILE
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5185 if ($help){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5186 print_helpfile();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5187 exit;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5188 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5189 if ($version){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5190 print << "VERSION";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5191
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5192
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5193 Bismark - Bisulfite Mapper and Methylation Caller.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5194
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5195 Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5196 www.bioinformatics.babraham.ac.uk/projects/
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5197
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5198
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5199 VERSION
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5200 exit;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5201 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5202
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5203
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5204 ##########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5205 ### PROCESSING OPTIONS ###
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5206 ##########################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5207
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5208 unless ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5209 $bowtie2 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5210 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5211 unless ($sam_no_hd){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5212 $sam_no_hd =0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5213 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5214
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5215 ### PATH TO BOWTIE
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5216 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5217 if ($path_to_bowtie){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5218 unless ($path_to_bowtie =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5219 $path_to_bowtie =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5220 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5221 if (-d $path_to_bowtie){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5222 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5223 $path_to_bowtie = "${path_to_bowtie}bowtie2";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5224 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5225 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5226 $path_to_bowtie = "${path_to_bowtie}bowtie";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5227 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5228 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5229 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5230 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5231 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5232 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5233 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5234 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5235 $path_to_bowtie = 'bowtie2';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5236 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5237 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5238 $path_to_bowtie = 'bowtie';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5239 warn "Path to Bowtie specified as: $path_to_bowtie\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5240 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5241 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5242
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5243 ####################################
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5244 ### PROCESSING ARGUMENTS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5245
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5246 ### GENOME FOLDER
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5247 my $genome_folder = shift @ARGV; # mandatory
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5248 unless ($genome_folder){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5249 warn "Genome folder was not specified!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5250 print_helpfile();
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5251 exit;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5252 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5253
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5254 ### checking that the genome folder, all subfolders and the required bowtie index files exist
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5255 unless ($genome_folder =~/\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5256 $genome_folder =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5257 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5258
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5259 if (chdir $genome_folder){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5260 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5261 unless ($absolute_genome_folder =~/\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5262 $absolute_genome_folder =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5263 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5264 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5265 $genome_folder = $absolute_genome_folder;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5266 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5267 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5268 die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5269 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5270
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5271 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5272 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5273
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5274 if ($bowtie2){ ### Bowtie 2 (new)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5275 ### checking the integrity of $CT_dir
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5276 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5277 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5278 foreach my $file(@CT_bowtie_index){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5279 unless (-f $file){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5280 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5281 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5282 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5283 ### checking the integrity of $GA_dir
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5284 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5285 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5286 foreach my $file(@GA_bowtie_index){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5287 unless (-f $file){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5288 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5289 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5290 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5291 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5292
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5293 else{ ### Bowtie 1 (default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5294 ### checking the integrity of $CT_dir
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5295 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5296 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5297 foreach my $file(@CT_bowtie_index){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5298 unless (-f $file){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5299 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5300 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5301 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5302 ### checking the integrity of $GA_dir
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5303 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5304 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5305 foreach my $file(@GA_bowtie_index){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5306 unless (-f $file){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5307 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5308 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5309 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5310 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5311
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5312 my $CT_index_basename = "${CT_dir}BS_CT";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5313 my $GA_index_basename = "${GA_dir}BS_GA";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5314
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5315 ### INPUT OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5316
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5317 ### SEQUENCE FILE FORMAT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5318 ### exits if both fastA and FastQ were specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5319 if ($fasta and $fastq){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5320 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5321 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5322
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5323 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5324 if ($fasta){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5325 print "FastA format specified\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5326 $sequence_format = 'FASTA';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5327 push @bowtie_options, '-f';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5328 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5329 elsif ($fastq){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5330 print "FastQ format specified\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5331 $sequence_format = 'FASTQ';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5332 push @bowtie_options, '-q';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5333 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5334 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5335 $fastq = 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5336 print "FastQ format assumed (by default)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5337 $sequence_format = 'FASTQ';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5338 push @bowtie_options, '-q';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5339 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5340
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5341 ### SKIP
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5342 if ($skip){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5343 warn "Skipping the first $skip reads from the input file\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5344 # push @bowtie_options,"-s $skip";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5345 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5346
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5347 ### UPTO
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5348 if ($qupto){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5349 warn "Processing sequences up to read no. $qupto from the input file\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5350 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5351 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5352 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5353 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5354 # push @bowtie_options,"--qupto $qupto";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5355 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5356 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5357
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5358 ### QUALITY VALUES
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5359 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5360 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5361 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5362 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5363 # Phred quality values work only when -q is specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5364 unless ($fastq){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5365 die "Phred quality values works only when -q (FASTQ) is specified\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5366 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5367 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5368 push @bowtie_options,"--phred33";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5369 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5370 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5371 push @bowtie_options,"--phred33-quals";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5372 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5373 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5374 if ($phred64){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5375 # Phred quality values work only when -q is specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5376 unless ($fastq){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5377 die "Phred quality values work only when -q (FASTQ) is specified\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5378 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5379 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5380 push @bowtie_options,"--phred64";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5381 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5382 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5383 push @bowtie_options,"--phred64-quals";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5384 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5385 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5386 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5387 $phred64 = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5388 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5389
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5390 if ($solexa){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5391 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5392 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5393 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5394 # Solexa to Phred value conversion works only when -q is specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5395 unless ($fastq){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5396 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5397 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5398 push @bowtie_options,"--solexa-quals";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5399 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5400 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5401 $solexa = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5402 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5403
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5404 ### ALIGNMENT OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5405
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5406 ### MISMATCHES
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5407 if (defined $mismatches){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5408 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5409 if ($mismatches == 0 or $mismatches == 1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5410 push @bowtie_options,"-N $mismatches";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5411 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5412 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5413 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5414 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5415 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5416 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5417 if ($mismatches >= 0 and $mismatches <= 3){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5418 push @bowtie_options,"-n $mismatches";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5419 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5420 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5421 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5422 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5423 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5424 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5425 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5426 unless ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5427 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5428 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5429 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5430
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5431 ### SEED LENGTH
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5432 if (defined $seed_length){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5433 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5434 push @bowtie_options,"-L $seed_length";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5435 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5436 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5437 push @bowtie_options,"-l $seed_length";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5438 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5439 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5440
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5441 ### MISMATCH CEILING
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5442 if (defined $ceiling){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5443 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5444 push @bowtie_options,"-e $ceiling";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5445 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5446
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5447
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5448 ### BOWTIE 2 EFFORT OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5449
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5450 ### CONSECUTIVE SEED EXTENSION FAILS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5451 if (defined $seed_extension_fails){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5452 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5453 push @bowtie_options,"-D $seed_extension_fails";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5454 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5455
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5456 ### RE-SEEDING REPETITIVE SEEDS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5457 if (defined $reseed_repetitive_seeds){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5458 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5459 push @bowtie_options,"-R $reseed_repetitive_seeds";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5460 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5461
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5462
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5463 ### BOWTIE 2 SCORING OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5464 if ($score_min){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5465 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5466 unless ($score_min =~ /^L,.+,.+$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5467 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5468 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5469 push @bowtie_options,"--score-min $score_min";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5470 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5471 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5472 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5473 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5474 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5475 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5476
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5477 ### BOWTIE 2 PARALLELIZATION OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5478 if (defined $parallel){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5479 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5480 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5481 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5482 if ($parallel){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5483 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5484 push @bowtie_options,"-p $parallel";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5485 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is absolutely required for parallelization to work.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5486 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5487 sleep (2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5488 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5489 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5490
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5491 ### REPORTING OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5492
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5493 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5494 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5495
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5496 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5497 if(defined $most_valid_alignments){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5498
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5499 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5500 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5501 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5502 # else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5503 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5504 # }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5505 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5506 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5507 push @bowtie_options,'-k 2';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5508 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5509
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5510 ### --BEST
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5511 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5512 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5513 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5514 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5515 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5516 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5517 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5518 unless ($best){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5519 push @bowtie_options,'--best';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5520 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5521 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5522
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5523 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5524 if ($vanilla){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5525 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5526 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5527 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5528 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5529 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5530 $vanilla = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5531 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5532
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5533 ### PAIRED-END MAPPING
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5534 if ($mates1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5535 my @mates1 = (split (/,/,$mates1));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5536 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5537 my @mates2 = (split(/,/,$mates2));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5538 unless (scalar @mates1 == scalar @mates2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5539 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5540 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5541 while (1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5542 my $mate1 = shift @mates1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5543 my $mate2 = shift @mates2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5544 last unless ($mate1 and $mate2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5545 push @filenames,"$mate1,$mate2";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5546 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5547 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5548 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5549 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5550 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5551 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5552 elsif ($mates2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5553 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5554 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5555
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5556 ### SINGLE-END MAPPING
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5557 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5558 my $singles;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5559 unless ($mates1 and $mates2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5560 $singles = join (',',@ARGV);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5561 unless ($singles){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5562 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5563 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5564 $singles =~ s/\s/,/g;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5565 @filenames = (split(/,/,$singles));
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5566 warn "\nFiles to be analysed:\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5567 warn "@filenames\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5568 sleep (3);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5569 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5570
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5571 ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5572 if (defined $minins){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5573 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5574 push @bowtie_options,"--minins $minins";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5575 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5576
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5577 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5578 if (defined $maxins){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5579 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5580 push @bowtie_options,"--maxins $maxins";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5581 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5582 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5583 unless ($singles){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5584 push @bowtie_options,'--maxins 500';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5585 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5586 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5587
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5588 ### QUIET prints nothing besides alignments (suppresses warnings)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5589 if ($quiet){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5590 push @bowtie_options,'--quiet';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5591 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5592
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5593 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5594 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5595 if (defined $chunk){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5596 push @bowtie_options,"--chunkmbs $chunk";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5597 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5598 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5599 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5600 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5601 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5602
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5603
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5604 ### SUMMARY OF ALL BOWTIE OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5605 my $bowtie_options = join (' ',@bowtie_options);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5606
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5607
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5608 ### STRAND-SPECIFIC LIBRARIES
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5609 my $directional;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5610 if ($non_directional){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5611 print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5612 sleep (3);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5613 $directional = 0;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5614 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5615 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5616 print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5617 sleep (3);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5618 $directional = 1; # Changed this to being the default behaviour
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5619 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5620
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5621 ### UNMAPPED SEQUENCE OUTPUT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5622 $unmapped = 0 unless ($unmapped);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5623
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5624 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5625 $multi_map = 0 unless ($multi_map);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5626
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5627
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5628 ### OUTPUT DIRECTORY
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5629
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5630 chdir $parent_dir or die "Failed to move back to current working directory\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5631 if ($output_dir){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5632 unless ($output_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5633 $output_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5634 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5635
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5636 if (chdir $output_dir){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5637 $output_dir = getcwd; # making the path absolute
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5638 unless ($output_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5639 $output_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5640 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5641 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5642 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5643 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5644 warn "Created output directory $output_dir!\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5645 chdir $output_dir or die "Failed to move to $output_dir\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5646 $output_dir = getcwd; # making the path absolute
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5647 unless ($output_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5648 $output_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5649 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5650 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5651 warn "Output will be written into the directory: $output_dir\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5652 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5653 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5654 $output_dir = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5655 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5656
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5657 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5658
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5659 chdir $parent_dir or die "Failed to move back to current working directory\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5660 if ($temp_dir){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5661 warn "\nUsing temp directory: $temp_dir\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5662 unless ($temp_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5663 $temp_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5664 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5665
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5666 if (chdir $temp_dir){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5667 $temp_dir = getcwd; # making the path absolute
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5668 unless ($temp_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5669 $temp_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5670 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5671 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5672 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5673 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5674 warn "Created temporary directory $temp_dir!\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5675 chdir $temp_dir or die "Failed to move to $temp_dir\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5676 $temp_dir = getcwd; # making the path absolute
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5677 unless ($temp_dir =~ /\/$/){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5678 $temp_dir =~ s/$/\//;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5679 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5680 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5681 warn "Temporary files will be written into the directory: $temp_dir\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5682 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5683 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5684 $temp_dir = '';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5685 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5686
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5687
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5688 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5689 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5690
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5691
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5692
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Writes the SAM header section to the OUT filehandle:
###   - one @HD line,
###   - one @SQ line per entry of %chromosomes (LN is the string length of the stored value),
###   - one @PG line recording the Bismark version and the command line used.
### Takes no arguments; reads %chromosomes, $bismark_version and $command_line from the enclosing scope.
sub generate_SAM_header{
  # @HD = header, VN = version, SO = sort order
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";

  # Perl does not guarantee a stable order for 'keys', so the names are sorted to make
  # the @SQ section reproducible between runs and identical for files aligned to the same genome
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length
  }

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n";
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5701
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5702 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5703 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5704
2432df265dad Uploaded
fcaramia
parents:
diff changeset
### Prints a single-end alignment as one tab-separated SAM record to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier (also the key into the parameter hash)
###   $actual_seq              - read sequence
###   $methylation_call_params - hash reference holding the alignment details, keyed by read ID
###   $qual                    - quality string of the read
###
### Dies if the alignment strand or the read/genome conversion combination is not one of
### the four expected bisulfite strands (OT, CTOB, OB, CTOT).
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  my $strand            = $methylation_call_params->{$id}{alignment_strand};
  my $chr               = $methylation_call_params->{$id}{chromosome};
  my $start             = $methylation_call_params->{$id}{position};
  my $ref_seq           = $methylation_call_params->{$id}{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}{genome_conversion};

  ##### FLAG
  ## Bitwise FLAG field of the SAM format. The only bit ever set for single-end output is
  ## 0x10 (16, "SEQ being reverse complemented"); all other bits stay 0.
  ## The flag is looked up by alignment strand first, then by "read conversion/genome conversion".
  my %flag_for = (
    '+' => {
      'CT/CT' => 0,  #  0 for "+" strand (OT)
      'GA/GA' => 16, # 16 for "-" strand (CTOB, yields information for the original bottom strand)
    },
    '-' => {
      'CT/GA' => 16, # 16 for "-" strand (OB)
      'GA/CT' => 0,  #  0 for "+" strand (CTOT, yields information for the original top strand)
    },
  );

  my $flags_by_conversion = $flag_for{$strand};
  die "Unexpected strand information: $strand\n\n" unless ($flags_by_conversion);

  my $flag = $flags_by_conversion->{"$read_conversion/$genome_conversion"};
  unless (defined $flag){
    die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }

  ##### MAPQ, RNEXT, PNEXT, TLEN
  ## Mapping quality is assumed to be unavailable (255); the remaining three are paired-end variables
  my ($mapq,$rnext,$pnext,$tlen) = (255,"*",0,0);

  ##### CIGAR
  ## Bowtie 2: the actual CIGAR string reported by the aligner
  ## Bowtie 1: output does not contain indels (only matches and mismatches), so it is simply <read length>M
  my $cigar = $bowtie2 ? $methylation_call_params->{$id}{CIGAR} : (length($actual_seq) . "M");

  ##### Trimming the genomic sequence
  ## The genomic sequence is 2 nucleotides longer than needed. For CT converted reads (original
  ## top or bottom strands) the additional nucleotides are removed from the 3' end (offset 0),
  ## otherwise (complementary strands in non-directional libraries) from the 5' end (offset 2).
  my $trim_offset = ($read_conversion eq 'CT') ? 0 : 2;
  $ref_seq = substr($ref_seq, $trim_offset, length($ref_seq) - 2);

  ##### Representing reverse strand alignments on the forward genomic strand
  my $on_reverse_strand = ($strand eq '-');
  if ($on_reverse_strand){
    $actual_seq = revcomp($actual_seq);   # sequence represented on the forward genomic strand
    $ref_seq    = revcomp($ref_seq);      # required for comparison with the actual sequence
    $qual       = scalar reverse $qual;   # the quality string has to follow the orientation of the sequence
  }

  ##### Optional tag NM: edit distance based on nucleotide differences
  ## hemming_dist() compares the read with the reference string; for Bowtie 2 the number of
  ## inserted/deleted bases (parsed while getting the genomic sequence) is added on top
  my $edit_distance = hemming_dist($actual_seq,$ref_seq);
  $edit_distance += $methylation_call_params->{$id}{indels} if ($bowtie2);
  my $NM_tag = "NM:i:$edit_distance";

  ##### Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)
  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq);

  ##### Optional tag XM: methylation call string
  ## Reversed for reverse strand alignments so that it lines up with the reverse-complemented sequence
  my $XM_tag = 'XM:Z:' . ($on_reverse_strand ? scalar reverse($methcall) : $methcall);

  ##### Optional tags XR (read conversion) and XG (genome conversion)
  my $XR_tag = "XR:Z:$read_conversion";
  my $XG_tag = "XG:Z:$genome_conversion";

  ##### Output
  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @sam_fields = (
    $id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,
    $NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,
  );
  print OUT join("\t",@sam_fields),"\n";
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5838
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5839
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5840 sub paired_end_SAM_output{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5841 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5842 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5843 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5844 my $chr = $methylation_call_params->{$id}->{chromosome};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5845 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5846 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5847 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5848 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5849 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5850 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5851 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5852 my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5853 my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5854
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5855 my $id_1 = $id.'/1';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5856 my $id_2 = $id.'/2';
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5857
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5858 # Allows all degenerate nucleotide sequences in reference genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5859 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5860 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5861
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5862 my $index; # used to store the srand origin of the alignment in a less convoluted way
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5863
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5864 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5865 $index = 0; ## this is OT (original top strand)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5866 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5867 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5868 $index = 1; ## this is CTOB (complementary to OB)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5869 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5870 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5871 $index = 2; ## this is CTOT (complementary to OT)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5872 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5873 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5874 $index = 3; ## this is OB (original bottom)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5875 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5876 else {
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5877 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5878 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5879
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5880 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5881 ### first or last position.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5882
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5883 if ($index == 0 or $index == 3){ # OT or OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5884 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5885 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5886 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5887 else{ # CTOT or CTOB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5888 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5889 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5890 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5891
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5892 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5893
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5894 my $start_read_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5895 my $start_read_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5896 # adjusting start positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5897
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5898 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5899 $start_read_1 = $methylation_call_params->{$id}->{position_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5900 $start_read_2 = $methylation_call_params->{$id}->{position_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5901 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5902 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5903 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5904 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5905 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5906 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5907 else{ # read 1 is on the - strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5908 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5909 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5910 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5911 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5912
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5913 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5914
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5915 my $end_read_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5916 my $end_read_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5917 # adjusting end positions
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5918
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5919 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5920 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5921 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5922 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5923 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5924 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5925 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5926 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5927 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5928 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5929 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5930 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5931 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5932 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5933
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5934 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5935
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5936 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5937 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5938 ## Bit Description Comment Value
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5939 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5940 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5941 ## 0x4 segment unmapped --- ---
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5942 ## 0x8 next segment in the template unmapped --- ---
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5943 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5944 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5945 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5946 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5947 ## 0x100 secondary alignment --- ---
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5948 ## 0x200 not passing quality controls --- ---
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5949 ## 0x400 PCR or optical duplicate --- ---
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5950
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5951 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5952
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5953 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5954 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5955
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5956 my $flag_1; # FLAG variable used for SAM format
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5957 my $flag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5958
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5959 if ($index == 0){ # OT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5960 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5961 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5962 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5963 elsif ($index == 1){ # CTOB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5964 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5965 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5966 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5967 elsif ($index == 2){ # CTOT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5968 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5969 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5970 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5971 elsif ($index == 3){ # OB
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5972 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5973 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5974 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5975
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5976 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5977
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5978 my $mapq = 255; # Mapping quality is unavailable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5979
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5980 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5981
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5982 my $cigar_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5983 my $cigar_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5984
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5985 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5986 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5987 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5988 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5989 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5990 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5991 $cigar_2 = length($actual_seq_2) . "M";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5992 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5993
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5994 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5995
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5996 my $rnext = '='; # Chromosome of mate; applies to both reads
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5997
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5998 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
5999
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6000 my $pnext_1 = $start_read_2; # Leftmost position of mate
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6001 my $pnext_2 = $start_read_1;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6002
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6003 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6004
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6005 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6006 my $tlen_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6007
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6008 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6009
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6010 if ($start_read_1 <= $start_read_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6011
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6012 # Read 1 alignment is leftmost
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6013
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6014 if ($end_read_2 >= $end_read_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6015
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6016 # -------------------------> read 1 reads overlapping
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6017 # <------------------------- read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6018 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6019 # or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6020 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6021 # -------------------------> read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6022 # <----------------------- read 2 read 2 contained within read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6023 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6024 # or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6025 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6026 # -------------------------> read 1 reads 1 and 2 exactly overlapping
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6027 # <------------------------- read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6028 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6029
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6030 # dovetailing of reads is not enabled for Bowtie 2 alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6031
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6032 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6033 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6034 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6035 elsif ($end_read_2 < $end_read_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6036
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6037 # -------------------------> read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6038 # <----------- read 2 read 2 contained within read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6039 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6040 # or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6041 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6042 # -------------------------> read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6043 # <----------- read 2 read 2 contained within read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6044
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6045 # start and end of read 2 are fully contained within read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6046 $tlen_1 = 0; # Set as 0 when the information is unavailable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6047 $tlen_2 = 0; # Set as 0 when the information is unavailable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6048 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6049
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6050 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6051
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6052 elsif ($start_read_2 < $start_read_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6053
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6054 if ($end_read_1 >= $end_read_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6055
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6056 # Read 2 alignment is leftmost
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6057
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6058 # -------------------------> read 2 reads overlapping
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6059 # <------------------------- read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6060 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6061 # or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6062 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6063 # -------------------------> read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6064 # <----------------------- read 1 read 1 contained within read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6065 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6066 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6067
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6068 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6069 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6070 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6071 elsif ($end_read_1 < $end_read_2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6072
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6073 # -------------------------> read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6074 # <----------- read 1 read 1 contained within read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6075 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6076 # or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6077 #
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6078 # -------------------------> read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6079 # <----------- read 1 read 1 contained within read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6080
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6081 # start and end of read 1 are fully contained within read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6082 $tlen_1 = 0; # Set as 0 when the information is unavailable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6083 $tlen_2 = 0; # Set as 0 when the information is unavailable
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6084 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6085 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6086 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6087
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6088 else{ # Bowtie 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6089
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6090 if ($end_read_2 >= $end_read_1){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6091 # Read 1 alignment is leftmost
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6092 # -------------------------> read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6093 # <------------------------- read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6094 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6095
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6096 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6097 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6098 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6099 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6100 # Read 2 alignment is leftmost
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6101 # -------------------------> read 2
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6102 # <------------------------- read 1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6103 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6104
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6105 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6106 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6107 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6108 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6109
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6110 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6111
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6112 # adjusting the strand of the sequence before we use them to generate mismatch strings
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6113 if ($strand_1 eq '-'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6114 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6115 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6116 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6117 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6118 if ($strand_2 eq '-'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6119 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6120 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6121 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6122 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6123
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6124 # print "$actual_seq_1\n$ref_seq_1\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6125 # print "$actual_seq_2\n$ref_seq_2\n\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6126
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6127 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6128
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6129 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6130 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6131 if ($bowtie2){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6132 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6133 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6134 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6135 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6136 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6137
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6138 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6139
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6140 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6141 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6142
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6143 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6144
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6145 my $XM_tag_1; # Optional tag XM: Methylation call string
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6146 my $XM_tag_2;
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6147
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6148 if ($strand_1 eq '-'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6149 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6150 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6151 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6152 $XM_tag_1 = "XM:Z:$methcall_1";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6153 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6154
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6155 if ($strand_2 eq '-'){
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6156 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6157 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6158 else{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6159 $XM_tag_2 = "XM:Z:$methcall_2";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6160 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6161
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6162 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6163
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6164 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6165 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6166
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6167 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6168
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6169 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6170
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6171 #####
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6172
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6173 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6174 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6175 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6176 }
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6177
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub revcomp{
  # Returns the reverse complement of a nucleotide sequence.
  #
  # Argument: the sequence string. A missing or false value (undef, '' or '0')
  #           is treated as a fatal error.
  # Returns:  the sequence reversed, with A<->T and C<->G exchanged. Lower-case
  #           a/c/t/g come back as upper-case complements; every other
  #           character (e.g. N) is only moved by the reversal, not translated.
  my $sequence = shift;
  die "Missing seq to reverse complement\n" unless $sequence;

  # reverse first, then complement the reversed copy in place
  (my $complemented = scalar reverse $sequence) =~ tr/ACTGactg/TGACTGAC/;

  return $complemented;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6184
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub hemming_dist{
  # Counts the positions of a read that do not match the reference sequence.
  #
  # Arguments: (1) the read sequence, (2) the reference sequence.
  # Returns:   length of the read minus the number of positions at which read
  #            and reference carry the identical (case-sensitive) character.
  #
  # Only positions covered by the read are inspected. If the reference is
  # shorter than the read, the uncovered read positions count as mismatches;
  # they are never looked up in the reference, so no 'uninitialized value'
  # warnings are raised. Undefined arguments are treated as empty strings.
  my ($actual_seq, $ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  my $read_length = length $actual_seq;
  my $ref_length  = length $ref_seq;

  # number of leading positions present in both sequences
  my $comparable = $ref_length < $read_length ? $ref_length : $read_length;

  my $matches = 0;
  foreach my $pos (0 .. $comparable - 1){
    ++$matches if substr($actual_seq, $pos, 1) eq substr($ref_seq, $pos, 1);
  }

  return $read_length - $matches;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6194
2432df265dad Uploaded
fcaramia
parents:
diff changeset
sub make_mismatch_string{
  # Builds the optional SAM tag XX, which lists the reference bases at the
  # mismatching positions of an alignment (no indel information).
  #
  # Arguments: (1) the read sequence, (2) the reference sequence; both must be
  #            true values, otherwise the sub dies.
  # Returns:   'XX:Z:' followed by alternating counts of matching positions and
  #            the reference base at each mismatch, e.g. 'XX:Z:1G2'.
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq = shift or die "Missing reference sequence";
  my $XX_tag = "XX:Z:";
  my $ref_length = length($ref_seq);

  # Bitwise string XOR: identical characters yield "\0", differing ones do not
  my $tmp = ($actual_seq ^ $ref_seq);
  my $prev_mm_pos = 0;

  while($tmp =~ /[^\0]/g){                        # Where bitwise comparison showed a difference
    my $mm_pos = pos($tmp);                       # 1-based position of this mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1;   # Number of matching positions since the last mismatch

    # Reference nucleotide that differs from the read. It stays undefined when
    # the mismatch lies beyond the end of the reference. Declared separately
    # from the conditional assignment because 'my ... if ...' has undefined
    # behaviour in Perl.
    my $nuc_mm;
    $nuc_mm = substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $XX_tag .= "$nuc_match" if $nuc_match > 0;    # Ignore if mismatches are adjacent to each other
    $XX_tag .= "$nuc_mm" if defined $nuc_mm;      # Nothing to report past the end of the reference
    $prev_mm_pos = $mm_pos;                       # Position of last mismatch
  }

  my $end_matches = $ref_length - $prev_mm_pos;   # Matches from the last mismatch to the end of the reference
  $XX_tag .= "$end_matches" if $end_matches > 0;  # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6212
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6213
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6214
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6215 sub print_helpfile{
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6216 print << "HOW_TO";
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6217
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6218
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6219 This program is free software: you can redistribute it and/or modify
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6220 it under the terms of the GNU General Public License as published by
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6221 the Free Software Foundation, either version 3 of the License, or
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6222 (at your option) any later version.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6223
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6224 This program is distributed in the hope that it will be useful,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6225 but WITHOUT ANY WARRANTY; without even the implied warranty of
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6226 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6227 GNU General Public License for more details.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6228 You should have received a copy of the GNU General Public License
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6229 along with this program. If not, see <http://www.gnu.org/licenses/>.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6230
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6231
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6232
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6233 DESCRIPTION
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6234
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6235
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6236 The following is a brief description of command line options and arguments to control the Bismark
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6237 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6238 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6239 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6240 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6241 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6242 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6243 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6244 sequence from the genome and determine if there were any protected C's present or not.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6245
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6246 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6247 re-enabled by using --non_directional.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6248
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6249 The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6250 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6251 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6252
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6253
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6254 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6255
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6256
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6257 ARGUMENTS:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6258
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6259 <genome_folder> The path to the folder containing the unmodified reference genome
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6260 as well as the subfolders created by the Bismark_Genome_Preparation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6261 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6262 Bismark expects one or more fastA files in this folder (file extension: .fa
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6263 or .fasta). The path can be relative or absolute.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6264
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6265 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6266 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6267 correspond file-for-file and read-for-read with those specified in <mates2>.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6268 Reads may be a mix of different lengths. Bismark will produce one mapping result
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6269 and one report file per paired-end input file pair.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6270
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6271 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6272 "_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6273 correspond file-for-file and read-for-read with those specified in <mates1>.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6274 Reads may be a mix of different lengths.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6275
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6276 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6277 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6278 produce one mapping result and one report file per input file.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6279
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6280
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6281 OPTIONS:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6282
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6283
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6284 Input:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6285
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6286 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6287 files (usually having extension .fq or .fastq). This is the default. See also
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6288 --solexa-quals.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6289
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6290 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6291 files (usually having extension .fa, .mfa, .fna or similar). All quality values
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6292 are assumed to be 40 on the Phred scale.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6293
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6294 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6295
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6296 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6297
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6298 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6299
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6300 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6301
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6302 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6303 (which can't). The formula for conversion is:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6304 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6305 is usually the right option for use with (unconverted) reads emitted by the GA
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6306 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6307
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6308 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6309 reads emitted by GA Pipeline version 1.3 or later. Default: off.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6310
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6311 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6312 specified it is assumed that Bowtie (1 or 2) is in the PATH.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6313
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6314
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6315 Alignment:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6316
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6317 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6318 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6319 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6320
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6321 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6322 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6323 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6324
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6325 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6326 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6327 quality values to the nearest 10 and saturates at 30. This value is not relevant for
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6328 Bowtie 2.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6329
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6330 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6331 --best mode. Best-first search must keep track of many paths at once to ensure it is
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6332 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6333 memory impact of the descriptors, but they can still grow very large in some cases. If
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6334 you receive an error message saying that chunk memory has been exhausted in --best mode,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6335 try adjusting this parameter up to dedicate more memory to the descriptors. This value
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6336 is not relevant for Bowtie 2. Default: 512.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6337
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6338 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6339 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6340 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6341 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6342
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6343 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6344 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6345 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6346 A 61-bp gap would not be valid in that case. Default: 500.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6347
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6348
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6349 Bowtie 1 Reporting:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6350
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6351 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6352 will be used by default.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6353
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6354 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6355 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6356 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6357 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6358 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6359 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6360 the best alignment). --best mode also removes all strand bias. Note that --best does not
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6361 affect which alignments are considered "valid" by Bowtie, only which valid alignments
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6362 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6363 Default: on.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6364
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6365 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6366 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6367
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6368
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6369 Output:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6370
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6371 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6372 bisulfite strands will be reported. Default: OFF.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6373
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6374 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6375 to the original strands are merely theoretical and should not exist in reality. Specifying directional
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6376 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6377 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6378 for strand-specific libraries).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6379
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6380 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6381 split up into several smaller files to run concurrently and the output files are to be merged.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6382
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6383 --quiet Print nothing besides alignments.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6384
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6385 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6386 of SAM format output.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6387
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6388 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6389 appear as they did in the input, without any translation of quality values that may have
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6390 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6391 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6392 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6393 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6394
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6395 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6396 mismatches or other reads that fail to align uniquely to a file in the output directory.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6397 Written reads will appear as they did in the input, without any of the translation of quality
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6398 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6399 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6400 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6401
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6402 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6403 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6404 to create it first. The path to the output folder can be either relative or absolute.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6405
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6406 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6407 the specified folder does not exist, Bismark will attempt to create it first. The path to the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6408 temporary folder can be either relative or absolute.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6409
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6410
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6411
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6412 Other:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6413
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6414 -h/--help Displays this help file.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6415
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6416 -v/--version Displays version information.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6417
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6418
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6419 BOWTIE 2 SPECIFIC OPTIONS
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6420
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6421 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6422 alignments, i.e. searches for alignments involving all read characters (also called
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6423 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6424 and/or quality trimmed where appropriate. Default: off.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6425
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6426 Bowtie 2 alignment options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6427
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6428 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6429 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6430 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6431 Bowtie 1 see -n).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6432
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6433 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6434 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6435 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6436 Bowtie 1 see -l).
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6437
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6438 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6439 position to be the highest possible, regardless of the actual value. I.e. input is treated
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6440 as though all quality values are high. This is also the default behavior when the input
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6441 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6442
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6443
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6444 Bowtie 2 paired-end options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6445
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6446 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6447 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6448 on by default.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6449
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6450 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6451 A discordant alignment is an alignment where both mates align uniquely, but that does not
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6452 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6453 and it is on by default.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6454
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6455
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6456 Bowtie 2 effort options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6457
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6458 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6459 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6460 new second-best alignment. Default: 15.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6461
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6462 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6463 When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6464 mismatches allowed) at different offsets and searches for more alignments. A read is considered
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6465 to have repetitive seeds if the total number of seed hits divided by the number of seeds
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6466 that aligned at least once is greater than 300. Default: 2.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6467
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6468 Bowtie 2 parallelization options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6469
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6470
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6471 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6472 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6473 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6474 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6475 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6476 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6477 automatically use the option '--reorder', which guarantees that output SAM records are printed in
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6478 an order corresponding to the order of the reads in the original input file, even when -p is set
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6479 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6480 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6481 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6482 correspond to input order in that case.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6483
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6484 Bowtie 2 Scoring options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6485
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6486 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6487 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6488 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6489 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6490 L,0,-0.2.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6491
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6492
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6493 Bowtie 2 Reporting options:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6494
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6495 --most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6496 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6497 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6498 effort expended to find valid alignments.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6499
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6500 For reference, this used to be the old (now deprecated) description of -M:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6501 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6502 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6503 happens first. Only the best alignment is reported. Information from the other alignments is used to
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6504 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6505 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6506 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6507 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6508 always used and its default value is set to 10.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6509
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6510
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6511 'VANILLA' Bismark OUTPUT:
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6512
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6513 Single-end output format (tab-separated):
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6514
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6515 (1) <seq-ID>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6516 (2) <read alignment strand>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6517 (3) <chromosome>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6518 (4) <start position>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6519 (5) <end position>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6520 (6) <observed bisulfite sequence>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6521 (7) <equivalent genomic sequence>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6522 (8) <methylation call>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6523 (9) <read conversion>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6524 (10) <genome conversion>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6525 (11) <read quality score (Phred33)>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6526
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6527
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6528 Paired-end output format (tab-separated):
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6529 (1) <seq-ID>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6530 (2) <read 1 alignment strand>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6531 (3) <chromosome>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6532 (4) <start position>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6533 (5) <end position>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6534 (6) <observed bisulfite sequence 1>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6535 (7) <equivalent genomic sequence 1>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6536 (8) <methylation call 1>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6537 (9) <observed bisulfite sequence 2>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6538 (10) <equivalent genomic sequence 2>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6539 (11) <methylation call 2>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6540 (12) <read 1 conversion>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6541 (13) <genome conversion>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6542 (14) <read 1 quality score (Phred33)>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6543 (15) <read 2 quality score (Phred33)>
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6544
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6545
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6546 Bismark SAM OUTPUT (default):
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6547
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6548 (1) QNAME (seq-ID)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6549 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account; this is different from ordinary DNA alignment flags!)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6550 (3) RNAME (chromosome)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6551 (4) POS (start position)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6552 (5) MAPQ (always 255)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6553 (6) CIGAR
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6554 (7) RNEXT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6555 (8) PNEXT
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6556 (9) TLEN
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6557 (10) SEQ
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6558 (11) QUAL (Phred33 scale)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6559 (12) NM-tag (edit distance to the reference)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6560 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6561 (14) XM-tag (methylation call string)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6562 (15) XR-tag (read conversion state for the alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6563 (16) XG-tag (genome conversion state for the alignment)
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6564
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6565 Each read of paired-end alignments is written out in a separate line in the above format.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6566
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6567
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6568 This script was last edited on 31 July 2012.
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6569
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6570 HOW_TO
2432df265dad Uploaded
fcaramia
parents:
diff changeset
6571 }