annotate bismark_wrapper/bismark_methylation_extractor @ 1:183de9d00131 draft

add indices.loc files
author bjoern-gruening
date Tue, 25 Dec 2012 05:52:28 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1 #!/usr/bin/perl
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2 use warnings;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3 use strict;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
4 $|++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
5 use Getopt::Long;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
6 use Cwd;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
7 use Carp;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
8
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
# File-scoped state shared between the main program and the subroutines below.

# Input files; filled from @ARGV by process_commandline().
my @filenames;

# Per-file methylation call counters; re-initialised for every input file.
my %counting;

# Directory the script was launched from (reported when no output dir is given).
my $parent_dir = getcwd();

# Output filehandles; emptied again for every input file.
my %fhs;

# Must be set before process_commandline() runs, which prints it.
my $version = 'v0.7.7';

# Parse and validate the command line. The order of the returned values is
# fixed by process_commandline(), so the variables are listed one per line
# in exactly that order.
my (
    $ignore,
    $genomic_fasta,
    $single,
    $paired,
    $full,
    $report,
    $no_overlap,
    $merge_non_CpG,
    $vanilla,
    $output_dir,
    $no_header,
    $bedGraph,
    $remove,
    $coverage_threshold,
    $counts,
    $cytosine_report,
    $genome_folder,
    $zero,
    $CpG_only,
    $CX_context,
    $split_by_chromosome,
) = process_commandline();


### only needed for bedGraph output

# if files are to be written to bedGraph format, these are the methylation
# extractor output files
my @sorting_files;

# [0] = methylated, [1] = unmethylated, [2] = total
my @methylcalls = qw (0 0 0);

my @bedfiles;

### only needed for genome-wide cytosine methylation report
my %chromosomes;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
26
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
##############################################################################################
### Summarising Run Parameters
##############################################################################################
# This section only reports the chosen options on STDERR (via warn). The only
# state it changes is $CpG_only (defaulted to 1 unless --CX_context was given)
# and $genome_folder (trailing slash added, or replaced by the built-in default).

### METHYLATION EXTRACTOR

warn "Summarising Bismark methylation extractor parameters:\n";
warn '=' x 63, "\n";

# Input format: single-end or paired-end, old "vanilla" format or SAM (default)
if ($single) {
    warn $vanilla
        ? "Bismark single-end vanilla format specified\n"
        : "Bismark single-end SAM format specified (default)\n";
}
elsif ($paired) {
    warn $vanilla
        ? "Bismark paired-end vanilla format specified\n"
        : "Bismark paired-end SAM format specified (default)\n";
}

warn "First $ignore bases will be disregarded when processing the methylation call string\n"
    if $ignore;

warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"
    if $full;

warn "Merge CHG and CHH context to non-CpG context specified\n"
    if $merge_non_CpG;

### output directory
if ($output_dir ne '') {
    warn "Output path specified as: $output_dir\n";
}
else {
    warn "Output will be written to the current directory ('$parent_dir')\n";
}

# short pause so the summary can be read
sleep(1);

### BEDGRAPH

if ($bedGraph) {
    warn "\n\nSummarising bedGraph parameters:\n";
    warn '=' x 63, "\n";

    warn $counts
        ? "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"
        : "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";

    warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";

    if ($CX_context) {
        warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
    }
    else {
        # default: restrict the bedGraph output to CpG context
        $CpG_only = 1;
        warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
    }

    warn "White spaces in read ID names will be removed prior to sorting\n"
        if $remove;

    sleep(1);

    ### GENOME-WIDE CYTOSINE REPORT (only summarised when --bedGraph is active as well)
    if ($cytosine_report) {
        warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
        warn '=' x 63, "\n";
        warn "Generating comprehensive genome-wide cytosine report (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> )\n";

        if ($CX_context) {
            warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
        }
        else {
            # default: CpG context only
            $CpG_only = 1;
            warn "Reporting cytosine methylation in CpG context only (default)\n";
        }

        warn "Splitting the cytosine report output up into individual files for each chromosome\n"
            if $split_by_chromosome;

        ### Zero-based coordinates
        warn $zero
            ? "Using zero-based genomic coordinates (user-defined)\n"
            : "Using 1-based genomic coordinates (default)\n";

        ### GENOME folder
        if ($genome_folder) {
            # make sure the folder path ends in a slash
            $genome_folder =~ s/$/\// unless $genome_folder =~ /\/$/;
            warn "Genome folder was specified as $genome_folder\n";
        }
        else {
            # no folder given: fall back to the hard-coded default location
            $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
            warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
        }

        sleep(1);
    }
}

warn "\n";
sleep(5);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
146
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
######################################################
### PROCESSING FILES
######################################################
# Runs the methylation extraction for every input file. When --bedGraph was
# requested, a bedGraph file is written next to it, and when --cytosine_report
# was requested as well, the genome-wide cytosine report is generated from
# that bedGraph file.

foreach my $filename (@filenames) {

    # resetting counters and filehandles
    %fhs      = ();
    %counting = (
        total_meCHG_count            => 0,
        total_meCHH_count            => 0,
        total_meCpG_count            => 0,
        total_unmethylated_CHG_count => 0,
        total_unmethylated_CHH_count => 0,
        total_unmethylated_CpG_count => 0,
        sequences_count              => 0,
    );
    @sorting_files = ();
    @bedfiles      = ();

    process_Bismark_results_file($filename);

    # everything below is bedGraph (and cytosine report) post-processing
    next unless $bedGraph;

    # Derive the bedGraph file name from the input name: a trailing 'sam' or
    # 'txt' is replaced by 'bedGraph' (e.g. 'reads.sam' -> 'reads.bedGraph').
    my $out = $filename;
    $out =~ s/sam$//;
    $out =~ s/txt$//;
    $out =~ s/$/bedGraph/;

    my $bedGraph_output = $out;
    my $bedGraph_path   = $output_dir . $out;

    # NOTE(review): process_bedGraph_output() is called without arguments, so
    # it presumably writes through the global OUT handle; the bareword handle
    # is therefore kept as is.
    open (OUT, '>', $bedGraph_path)
        or die "Failed to open bedGraph output file '$bedGraph_path' for writing: $!";
    # warn "Writing bedGraph to file: $out\n";

    process_bedGraph_output();

    # buffered write errors only surface at close, so report the file here too
    close OUT
        or die "Failed to close bedGraph output file '$bedGraph_path': $!";

    ### genome-wide cytosine methylation report requires bedGraph processing anyway
    if ($cytosine_report) {
        # base name for the report: the bedGraph name without its 'bedGraph' suffix
        my $cytosine_out = $out;
        $cytosine_out =~ s/bedGraph$//;

        # NOTE(review): the genome is re-read for every input file; confirm
        # whether read_genome_into_memory() could be called once up front.
        read_genome_into_memory();
        generate_genome_wide_cytosine_report($bedGraph_output, $cytosine_out);
    }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
191
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
192
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
193 sub process_commandline{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
194 my $help;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
195 my $single_end;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
196 my $paired_end;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
197 my $ignore;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
198 my $genomic_fasta;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
199 my $full;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
200 my $report;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
201 my $extractor_version;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
202 my $no_overlap;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
203 my $merge_non_CpG;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
204 my $vanilla;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
205 my $output_dir;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
206 my $no_header;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
207 my $bedGraph;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
208 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
209 my $remove;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
210 my $counts;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
211 my $cytosine_report;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
212 my $genome_folder;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
213 my $zero;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
214 my $CpG_only;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
215 my $CX_context;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
216 my $split_by_chromosome;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
217
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
218
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
219 my $command_line = GetOptions ('help|man' => \$help,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
220 'p|paired-end' => \$paired_end,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
221 's|single-end' => \$single_end,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
222 'fasta' => \$genomic_fasta,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
223 'ignore=i' => \$ignore,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
224 'comprehensive' => \$full,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
225 'report' => \$report,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
226 'version' => \$extractor_version,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
227 'no_overlap' => \$no_overlap,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
228 'merge_non_CpG' => \$merge_non_CpG,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
229 'vanilla' => \$vanilla,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
230 'o|output=s' => \$output_dir,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
231 'no_header' => \$no_header,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
232 'bedGraph' => \$bedGraph,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
233 "cutoff=i" => \$coverage_threshold,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
234 "remove_spaces" => \$remove,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
235 "counts" => \$counts,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
236 "cytosine_report" => \$cytosine_report,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
237 'g|genome_folder=s' => \$genome_folder,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
238 "zero_based" => \$zero,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
239 "CX|CX_context" => \$CX_context,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
240 "split_by_chromosome" => \$split_by_chromosome,
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
241 );
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
242
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
243 ### EXIT ON ERROR if there were errors with any of the supplied options
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
244 unless ($command_line){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
245 die "Please respecify command line options\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
246 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
247
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
248 ### HELPFILE
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
249 if ($help){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
250 print_helpfile();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
251 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
252 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
253
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
254 if ($extractor_version){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
255 print << "VERSION";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
256
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
257
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
258 Bismark Methylation Extractor
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
259
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
260 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
261 www.bioinformatics.babraham.ac.uk/projects/bismark/
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
262
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
263
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
264 VERSION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
265 exit;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
266 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
267
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
268
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
269 ### no files provided
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
270 unless (@ARGV){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
271 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
272 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
273 @filenames = @ARGV;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
274
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
275 warn "\n *** Bismark methylation extractor version $version ***\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
276
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
277 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
278 unless ($ignore){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
279 $ignore = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
280 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
281 ### PRINT A REPORT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
282 unless ($report){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
283 $report = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
284 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
285
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
286 ### OUTPUT DIR PATH
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
287 if ($output_dir){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
288 unless ($output_dir =~ /\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
289 $output_dir =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
290 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
291 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
292 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
293 $output_dir = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
294 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
295
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
296 ### NO HEADER
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
297 unless ($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
298 $no_header = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
299 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
300
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
301 ### OLD (VANILLA) OUTPUT FORMAT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
302 unless ($vanilla){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
303 $vanilla = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
304 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
305
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
306 if ($single_end){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
307 $paired_end = 0; ### SINGLE END ALIGNMENTS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
308 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
309 elsif ($paired_end){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
310 $single_end = 0; ### PAIRED-END ALIGNMENTS
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
311 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
312 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
313 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
314 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
315
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
316 ### NO OVERLAP
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
317 if ($no_overlap){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
318 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
319 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
320 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
321 $no_overlap = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
322 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
323
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
324 ### COMPREHENSIVE OUTPUT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
325 unless ($full){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
326 $full = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
327 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
328
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
329 ### MERGE NON-CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
330 unless ($merge_non_CpG){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
331 $merge_non_CpG = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
332 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
333
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
334 ### remove white spaces in read ID (needed for sorting using the sort command)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
335 unless ($remove){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
336 $remove = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
337 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
338
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
339 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
340 unless (defined $coverage_threshold){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
341 unless ($coverage_threshold > 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
342 die "Please select a coverage greater than 0 (positive integers only)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
343 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
344 $coverage_threshold = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
345 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
346
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
347 if ($zero){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
348 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
349 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
350
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
351 if ($CX_context){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
352 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
353 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
354
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
355 if ($cytosine_report){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
356
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
357 ### GENOME folder
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
358 if ($genome_folder){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
359 unless ($genome_folder =~/\/$/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
360 $genome_folder =~ s/$/\//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
361 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
362 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
363 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
364 die "Please specify a genome folder to proceed (full path only)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
365 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
366
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
367 unless ($bedGraph){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
368 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
369 $bedGraph = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
370 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
371 unless ($counts){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
372 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
373 $counts = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
374 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
375 warn "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
376 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
377
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
378 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
379 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
380
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
381
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
382 sub process_Bismark_results_file{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
383 my $filename = shift;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
384
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
385 warn "\nNow reading in Bismark result file $filename\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
386
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
387 if ($filename =~ /\.gz$/) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
388 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
389 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
390 open (IN,$filename) or die "Can't open file $filename: $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
391 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
392
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
393 ### Vanilla and SAM output need to read different numbers of header lines
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
394 if ($vanilla) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
395 my $bismark_version = <IN>; ## discarding the Bismark version info
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
396 chomp $bismark_version;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
397 $bismark_version =~ s/\r//; # replaces \r line feed
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
398 $bismark_version =~ s/Bismark version: //;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
399 if ($bismark_version =~ /^\@/) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
400 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
401 sleep (2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
402 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
403
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
404 unless ($version eq $bismark_version){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
405 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
406 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
407 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
408 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
409 # We are reading from it further down
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
410 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
411
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
412 my $output_filename = (split (/\//,$filename))[-1];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
413
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
414 ### OPENING OUTPUT-FILEHANDLES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
415 if ($report) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
416 my $report_filename = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
417 $report_filename =~ s/[\.sam|\.txt]$//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
418 $report_filename =~ s/$/_splitting_report.txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
419 $report_filename = $output_dir . $report_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
420 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
421 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
422
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
423 if ($report) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
424 print REPORT "$output_filename\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
425 print REPORT "Parameters used to extract methylation information:\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
426 if ($paired) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
427 if ($vanilla) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
428 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
429 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
430 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
431 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
432 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
433
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
434 if ($single) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
435 if ($vanilla) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
436 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
437 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
438 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
439 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
440 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
441
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
442 if ($ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
443 print REPORT "Ignoring first $ignore bases\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
444 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
445
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
446 if ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
447 print REPORT "Output specified: comprehensive\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
448 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
449 print REPORT "Output specified: strand-specific (default)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
450 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
451
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
452 if ($no_overlap) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
453 print REPORT "No overlapping methylation calls specified\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
454 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
455 if ($genomic_fasta) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
456 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
457 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
458 if ($merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
459 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
460 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
461
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
462 print REPORT "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
463 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
464
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
465 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
466 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
467 if ($full and $merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
468 my $cpg_output = my $other_c_output = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
469 ### C in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
470 $cpg_output =~ s/^/CpG_context_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
471 $cpg_output =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
472 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
473 $cpg_output = $output_dir . $cpg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
474 push @sorting_files,$cpg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
475 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
476 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
477
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
478 unless ($no_header) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
479 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
480 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
481
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
482 ### C in any other context than CpG
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
483 $other_c_output =~ s/^/Non_CpG_context_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
484 $other_c_output =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
485 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
486 $other_c_output = $output_dir . $other_c_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
487 push @sorting_files,$other_c_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
488 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
489 print "Writing result file containing methylation information for C in any other context to $other_c_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
490
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
491 unless ($no_header) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
492 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
493 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
494 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
495
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
496 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
497 elsif ($merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
498
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
499 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
500
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
501 ### For cytosines in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
502 $cpg_ot =~ s/^/CpG_OT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
503 $cpg_ot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
504 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
505 $cpg_ot = $output_dir . $cpg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
506 push @sorting_files,$cpg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
507 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
508 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
509
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
510 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
511 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
512 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
513
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
514 $cpg_ctot =~ s/^/CpG_CTOT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
515 $cpg_ctot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
516 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
517 $cpg_ctot = $output_dir . $cpg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
518 push @sorting_files,$cpg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
519 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
520 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
521
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
522 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
523 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
524 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
525
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
526 $cpg_ctob =~ s/^/CpG_CTOB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
527 $cpg_ctob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
528 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
529 $cpg_ctob = $output_dir . $cpg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
530 push @sorting_files,$cpg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
531 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
532 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
533
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
534 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
535 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
536 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
537
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
538 $cpg_ob =~ s/^/CpG_OB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
539 $cpg_ob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
540 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
541 $cpg_ob = $output_dir . $cpg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
542 push @sorting_files,$cpg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
543 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
544 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
545
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
546 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
547 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
548 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
549
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
550 ### For cytosines in Non-CpG (CC, CT or CA) context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
551 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
552
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
553 $other_c_ot =~ s/^/Non_CpG_OT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
554 $other_c_ot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
555 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
556 $other_c_ot = $output_dir . $other_c_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
557 push @sorting_files,$other_c_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
558 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
559 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
560
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
561 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
562 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
563 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
564
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
565 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
566 $other_c_ctot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
567 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
568 $other_c_ctot = $output_dir . $other_c_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
569 push @sorting_files,$other_c_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
570 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
571 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
572
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
573 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
574 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
575 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
576
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
577 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
578 $other_c_ctob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
579 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
580 $other_c_ctob = $output_dir . $other_c_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
581 push @sorting_files,$other_c_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
582 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
583 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
584
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
585 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
586 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
587 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
588
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
589 $other_c_ob =~ s/^/Non_CpG_OB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
590 $other_c_ob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
591 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
592 $other_c_ob = $output_dir . $other_c_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
593 push @sorting_files,$other_c_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
594 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
595 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
596
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
597 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
598 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
599 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
600 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
601 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
602
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
603 ### if --comprehensive was specified we are only writing one file per context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
604 elsif ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
605 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
606 ### C in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
607 $cpg_output =~ s/^/CpG_context_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
608 $cpg_output =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
609 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
610 $cpg_output = $output_dir . $cpg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
611 push @sorting_files,$cpg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
612 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
613 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
614
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
615 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
616 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
617 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
618
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
619 ### C in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
620 $chg_output =~ s/^/CHG_context_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
621 $chg_output =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
622 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
623 $chg_output = $output_dir . $chg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
624 push @sorting_files,$chg_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
625 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
626 print "Writing result file containing methylation information for C in CHG context to $chg_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
627
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
628 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
629 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
630 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
631
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
632 ### C in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
633 $chh_output =~ s/^/CHH_context_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
634 $chh_output =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
635 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
636 $chh_output = $output_dir . $chh_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
637 push @sorting_files, $chh_output;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
638 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
639 print "Writing result file containing methylation information for C in CHH context to $chh_output\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
640
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
641 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
642 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
643 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
644 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
645 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
646 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
647 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
648
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
649 ### For cytosines in CpG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
650 $cpg_ot =~ s/^/CpG_OT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
651 $cpg_ot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
652 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
653 $cpg_ot = $output_dir . $cpg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
654 push @sorting_files,$cpg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
655 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
656 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
657
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
658 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
659 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
660 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
661
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
662 $cpg_ctot =~ s/^/CpG_CTOT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
663 $cpg_ctot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
664 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
665 $cpg_ctot = $output_dir . $cpg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
666 push @sorting_files,$cpg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
667 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
668 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
669
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
670 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
671 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
672 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
673
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
674 $cpg_ctob =~ s/^/CpG_CTOB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
675 $cpg_ctob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
676 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
677 $cpg_ctob = $output_dir . $cpg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
678 push @sorting_files,$cpg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
679 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
680 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
681
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
682 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
683 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
684 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
685
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
686 $cpg_ob =~ s/^/CpG_OB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
687 $cpg_ob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
688 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
689 $cpg_ob = $output_dir . $cpg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
690 push @sorting_files,$cpg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
691 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
692 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
693
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
694 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
695 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
696 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
697
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
698 ### For cytosines in CHG context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
699 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
700
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
701 $chg_ot =~ s/^/CHG_OT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
702 $chg_ot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
703 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
704 $chg_ot = $output_dir . $chg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
705 push @sorting_files,$chg_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
706 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
707 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
708
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
709 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
710 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
711 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
712
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
713 $chg_ctot =~ s/^/CHG_CTOT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
714 $chg_ctot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
715 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
716 $chg_ctot = $output_dir . $chg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
717 push @sorting_files,$chg_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
718 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
719 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
720
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
721 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
722 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
723 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
724
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
725 $chg_ctob =~ s/^/CHG_CTOB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
726 $chg_ctob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
727 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
728 $chg_ctob = $output_dir . $chg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
729 push @sorting_files,$chg_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
730 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
731 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
732
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
733 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
734 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
735 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
736
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
737 $chg_ob =~ s/^/CHG_OB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
738 $chg_ob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
739 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
740 $chg_ob = $output_dir . $chg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
741 push @sorting_files,$chg_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
742 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
743 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
744
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
745 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
746 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
747 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
748
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
749 ### For cytosines in CHH context
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
750 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
751
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
752 $chh_ot =~ s/^/CHH_OT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
753 $chh_ot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
754 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
755 $chh_ot = $output_dir . $chh_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
756 push @sorting_files,$chh_ot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
757 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
758 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
759
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
760 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
761 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
762 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
763
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
764 $chh_ctot =~ s/^/CHH_CTOT_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
765 $chh_ctot =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
766 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
767 $chh_ctot = $output_dir . $chh_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
768 push @sorting_files,$chh_ctot;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
769 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
770 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
771
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
772 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
773 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
774 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
775
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
776 $chh_ctob =~ s/^/CHH_CTOB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
777 $chh_ctob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
778 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
779 $chh_ctob = $output_dir . $chh_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
780 push @sorting_files,$chh_ctob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
781 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
782 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
783
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
784 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
785 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
786 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
787
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
788 $chh_ob =~ s/^/CHH_OB_/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
789 $chh_ob =~ s/sam$/txt/;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
790 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
791 $chh_ob = $output_dir . $chh_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
792 push @sorting_files,$chh_ob;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
793 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
794 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
795
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
796 unless($no_header){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
797 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
798 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
799 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
800
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
801 my $methylation_call_strings_processed = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
802 my $line_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
803
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
804 ### proceeding differently now for single-end or paired-end Bismark files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
805
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
806 ### PROCESSING SINGLE-END RESULT FILES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
807 if ($single) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
808
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
809 ### also proceeding differently now for SAM format or vanilla Bismark format files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
810 if ($vanilla) { # old vanilla Bismark output format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
811 while (<IN>) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
812 ++$line_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
813 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
814
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
815 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
816 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
817
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
818 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
819 ### last position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
820 chomp $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
821
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
822 my $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
823 if ($meth_call) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
824
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
826 $index = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
827 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
828 $index = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
829 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
830 $index = 3;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
832 $index = 2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
833 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
834 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
835 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
836
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
837 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
838 if ($ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
839 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
840
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
841 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
842 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
843 $start += $ignore;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
844 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
845 $start += length($meth_call)-1; ## $meth_call is already shortened!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
846 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
847 die "Alignment did not have proper strand information: $strand\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
848 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
849 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
850 ### printing out the methylation state of every C in the read
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
851 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
852
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
853 ++$methylation_call_strings_processed; # 1 per single-end result
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
854 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
855 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
856 } else { # processing single-end SAM format (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
857 while (<IN>) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
858 ### SAM format can either start with header lines (starting with @) or start with alignments directly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
859 if (/^\@/) { # skipping header lines (starting with @)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
860 warn "skipping SAM header line:\t$_";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
861 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
862 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
863
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
864 ++$line_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
865 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
866
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
867 # example read in SAM format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
868 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
869 ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
870
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
871 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
872 # < 0.7.6 $meth_call =~ s/^XM:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
873 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
874 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
875
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
876 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
877
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
878 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
879 my $meth_call; ### Thanks to Zachary Zeno for this solution
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
880 my $read_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
881 my $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
882
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
883 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
884 my $tag = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
885 my $value = $2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
886
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
887 if ($tag eq "XM") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
888 $meth_call = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
889 $meth_call =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
890 } elsif ($tag eq "XR") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
891 $read_conversion = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
892 $read_conversion =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
893 } elsif ($tag eq "XG") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
894 $genome_conversion = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
895 $genome_conversion =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
896 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
897 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
898
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
899 my $strand;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
900 chomp $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
901 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
902
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
903 my $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
904 if ($meth_call) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
905 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
906 $index = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
907 $strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
908 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
909 $index = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
910 $strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
911 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
912 $index = 2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
913 $strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
914 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
915 $index = 3;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
916 $strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
917 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
918 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
919 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
920
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
921 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
922 if ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
923 $meth_call = reverse $meth_call;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
924 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
925
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
926 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
927 if ($ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
928 # print "\n\n$meth_call\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
929 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
930 # print "$meth_call\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
931 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
932
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
933 my @len = split (/\D+/,$cigar); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
934 my @ops = split (/\d+/,$cigar); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
935 shift @ops; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
936 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
937
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
938 my @comp_cigar; # building an array with all CIGAR operations
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
939 foreach my $index (0..$#len) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
940 foreach (1..$len[$index]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
941 # print "$ops[$index]";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
942 push @comp_cigar, $ops[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
943 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
944 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
945 # print "original CIGAR: $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
946 # print "original CIGAR: @comp_cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
947
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
948 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
949 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
950
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
951 my $D_count = 0; # number of deletions (D) met while clipping the ignored bases; each one is added to the start position below
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
952 my $I_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
953
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
954 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
955 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
956 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
957
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
958 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
959 $D_count++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
960 $op = shift @comp_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
961 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
962 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
963 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
964 $I_count++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
965 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
966 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
967 $start += $ignore + $D_count - $I_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
968 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
969 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
970
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
971 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
972 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
973 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
974 $op = pop @comp_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
975 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
976 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
977
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
978 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
979 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
980 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
981 foreach (@comp_cigar) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
982 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
983 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
984 $start += $MD_count - 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
985 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
986
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
987 ### reconstituting shortened CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
988 my $new_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
989 my $count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
990 my $last_op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
991 # print "ignore adjusted: @comp_cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
992 foreach my $op (@comp_cigar) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
993 unless (defined $last_op){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
994 $last_op = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
995 ++$count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
996 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
997 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
998 if ($last_op eq $op) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
999 ++$count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1000 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1001 $new_cigar .= "$count$last_op";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1002 $last_op = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1003 $count = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1004 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1005 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1006 $new_cigar .= "$count$last_op"; # appending the last operation and count
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1007 $cigar = $new_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1008 # print "ignore adjusted scalar: $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1009 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1010 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1011 ### printing out the methylation state of every C in the read
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1012 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1013
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1014 ++$methylation_call_strings_processed; # 1 per single-end result
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1015 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1016 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1017 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1018
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1019 ### PROCESSING PAIRED-END RESULT FILES
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1020 elsif ($paired) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1021
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1022 ### proceeding differently now for SAM format or vanilla Bismark format files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1023 if ($vanilla) { # old vanilla Bismark paired-end output format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1024 while (<IN>) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1025 ++$line_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1026 warn "processed line: $line_count\n" if ($line_count%500000==0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1027
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1028 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1029 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1030
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1031 my $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1032 chomp $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1033
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1034 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1035 $index = 0; ## this is OT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1036 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1037 $index = 2; ## this is CTOB!!!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1038 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1039 $index = 1; ## this is CTOT!!!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1040 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1041 $index = 3; ## this is OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1042 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1043 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1044 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1045
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1046 if ($meth_call_1 and $meth_call_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1047 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1048 if ($ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1049 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1050 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1051
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1052 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1053 $start_read_1 += $ignore;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1054 $end_read_2 -= $ignore;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1055 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1056 my $end_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1057 my $start_read_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1058
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1059 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1060
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1061 $end_read_1 = $start_read_1+length($meth_call_1)-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1062 $start_read_2 = $end_read_2-length($meth_call_2)+1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1063
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1064 ## we first pass the first read which is in + orientation on the forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1065 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1066
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1067 # we next pass the second read which is in - orientation on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1068 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1069 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1070 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1071
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1072 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1073 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1074
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1075 ## we first pass the first read which is in - orientation on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1076 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1077
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1078 # we next pass the second read which is in + orientation on the forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
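1079 ### if --no_overlap was specified we also pass the start of read 2 ($start_read_2), which in this orientation is the leftmost position of the first reported read. If the second reported read starts to overlap with the first one we will stop extracting methylation calls from the second read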
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1080 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1081 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1082
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1083 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1084 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1085 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1086 } else { # Bismark paired-end SAM output format (default)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1087 while (<IN>) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1088 ### SAM format can either start with header lines (starting with @) or start with alignments directly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1089 if (/^\@/) { # skipping header lines (starting with @)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1090 warn "skipping SAM header line:\t$_";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1091 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1092 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1093
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1094 ++$line_count;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1095 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1096
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1097 # example paired-end reads in SAM format (2 consecutive lines)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1098 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1099 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1100
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1101 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1102
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1103 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1104 my $meth_call_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1105 my $first_read_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1106 my $genome_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1107
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1108 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1109 my $tag = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1110 my $value = $2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1111
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1112 if ($tag eq "XM") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1113 $meth_call_1 = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1114 $meth_call_1 =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1115 } elsif ($tag eq "XR") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1116 $first_read_conversion = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1117 $first_read_conversion =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1118 } elsif ($tag eq "XG") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1119 $genome_conversion = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1120 $genome_conversion =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1121 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1122 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1123
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1124 $_ = <IN>; # reading in the paired read
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1125
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1126 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1127 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1128 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1129 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1130 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1131
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1132 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1133
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1134 my $meth_call_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1135 my $second_read_conversion;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1136
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1137 while ( /(XM|XR):Z:([^\t]+)/g ) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1138 my $tag = $1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1139 my $value = $2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1140
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1141 if ($tag eq "XM") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1142 $meth_call_2 = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1143 $meth_call_2 =~ s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1144 } elsif ($tag eq "XR") {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1145 $second_read_conversion = $value;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1146 $second_read_conversion = s/\r//;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1147 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1148 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1149
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1150 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1151 chomp $genome_conversion; # in case it captured a new line character
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1152
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1153 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1154
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1155 my $index;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1156 my $strand;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1157
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1158 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1159 $index = 0; ## this is OT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1160 $strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1161 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1162 $index = 1; ## this is CTOT
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1163 $strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1164 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1165 $index = 2; ## this is CTOB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1166 $strand = '+';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1167 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1168 $index = 3; ## this is OB
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1169 $strand = '-';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1170 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1171 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1172 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1173
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1174 ### reversing the methylation call of the read that was reverse-complemented
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1175 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1176 $meth_call_2 = reverse $meth_call_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1177 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1178 $meth_call_1 = reverse $meth_call_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1179 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1180
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1181 if ($meth_call_1 and $meth_call_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1182
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1183 my $end_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1184
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1185 ### READ 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1186 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1187 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1188 shift @ops_1; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1189 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1190
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1191 my @comp_cigar_1; # building an array with all CIGAR operations
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1192 foreach my $index (0..$#len_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1193 foreach (1..$len_1[$index]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1194 # print "$ops_1[$index]";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1195 push @comp_cigar_1, $ops_1[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1196 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1197 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1198 # print "original CIGAR read 1: $cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1199 # print "original CIGAR read 1: @comp_cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1200
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1201 ### READ 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1202 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1203 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1204 shift @ops_2; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1205 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1206 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1207 foreach my $index (0..$#len_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1208 foreach (1..$len_2[$index]) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1209 # print "$ops_2[$index]";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1210 push @comp_cigar_2, $ops_2[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1211 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1212 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1213 # print "original CIGAR read 2: $cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1214 # print "original CIGAR read 2: @comp_cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1215
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1216 if ($ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1217 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1218 ### the methylation calls have already been reversed where necessary
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1219 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1220 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1221
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1222 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1223
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1224 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1225
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1226 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1227 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1228 my $I_count_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1229
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1230 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1231 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1232 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1233
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1234 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1235 $D_count_1++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1236 $op = shift @comp_cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1237 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1238 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1239 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1240 $I_count_1++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1241 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1242 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1243
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1244 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1245 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1246
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1247 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1248
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1249 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1250 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1251 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1252 $op = pop @comp_cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1253 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1254 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1255 # the start position of reads mapping to the reverse strand is being adjusted further below
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1256 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1257
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1258 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1259 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1260 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1261 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1262 $op = pop @comp_cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1263 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1264 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1265 # the start position of reads mapping to the reverse strand is being adjusted further below
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1266
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1267 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1268 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1269 my $I_count_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1270
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1271 for (1..$ignore) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1272 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1273 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1274
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1275 while ($op eq 'D') { # repeating this for deletions (D)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1276 $D_count_2++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1277 $op = shift @comp_cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1278 # print "$_ deleted $op\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1279 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1280 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1281 $I_count_2++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1282 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1283 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1284
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1285 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1286 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1287
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1288 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1289
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1290 ### reconstituting shortened CIGAR string 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1291 my $new_cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1292 my $count_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1293 my $last_op_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1294 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1295 foreach my $op (@comp_cigar_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1296 unless (defined $last_op_1){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1297 $last_op_1 = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1298 ++$count_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1299 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1300 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1301 if ($last_op_1 eq $op) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1302 ++$count_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1303 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1304 $new_cigar_1 .= "$count_1$last_op_1";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1305 $last_op_1 = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1306 $count_1 = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1307 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1308 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1309 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1310 $cigar_1 = $new_cigar_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1311 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1312
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1313 ### reconstituting shortened CIGAR string 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1314 my $new_cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1315 my $count_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1316 my $last_op_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1317 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1318 foreach my $op (@comp_cigar_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1319 unless (defined $last_op_2){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1320 $last_op_2 = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1321 ++$count_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1322 next;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1323 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1324 if ($last_op_2 eq $op) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1325 ++$count_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1326 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1327 $new_cigar_2 .= "$count_2$last_op_2";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1328 $last_op_2 = $op;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1329 $count_2 = 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1330 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1331 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1332 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1333 $cigar_2 = $new_cigar_2;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1334 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1335
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1336 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1337
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1338 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1339 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1340 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1341 # print "reverse: @comp_cigar_2\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1342
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1343 my $MD_count_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1344 foreach (@comp_cigar_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1345 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1346 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1347
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1348 my $MD_count_2 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1349 foreach (@comp_cigar_2) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1350 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1351 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1352
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1353 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1354 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1355 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1356 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1357
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1358 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1359 # print "reverse: @comp_cigar_1\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1360
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1361 my $MD_count_1 = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1362 foreach (@comp_cigar_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1363 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1364 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1365
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1366 $end_read_1 = $start_read_1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1367 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1368
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1369 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1370
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1371 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1372 ## we first pass the first read which is in + orientation on the forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1373 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1374
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1375 # we next pass the second read which is in - orientation on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1376 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1377 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1378 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1379 ## we first pass the first read which is in - orientation on the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1380 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1381
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1382 # we next pass the second read which is in + orientation on the forward strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1383 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1384 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1385 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1386
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1387 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1388 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1389 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1390 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1391 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1392 die "Single-end or paired-end reads not specified properly\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1393 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1394
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1395 print "\n\nProcessed $line_count lines from $filename in total\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1396 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1397 if ($report) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1398 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1399 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1400 print_splitting_report ();
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1401 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1402
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1403
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1404
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub print_splitting_report{

  ### Prints the final cytosine methylation summary for the file that has just been processed.
  ###
  ### Takes no arguments. Reads the file-level variables %counting (running totals of methylated
  ### and unmethylated calls per context), $report (true if a splitting report file is being
  ### written) and $merge_non_CpG (true if CHG and CHH are reported together as 'non-CpG').
  ###
  ### The summary is always printed to STDOUT; if $report is set, the very same text is also
  ### printed to the REPORT filehandle, which is expected to have been opened by the caller.
  ### Returns nothing meaningful.

  ### A counter which was never incremented might not exist in %counting at all (whether it is
  ### pre-initialised elsewhere is not visible from this subroutine), so missing values are
  ### treated as 0 to avoid 'uninitialized value' warnings and blank fields in the report
  my %count;
  foreach my $key (qw(total_meCpG_count total_unmethylated_CpG_count
                      total_meCHG_count total_unmethylated_CHG_count
                      total_meCHH_count total_unmethylated_CHH_count)){
    $count{$key} = defined $counting{$key} ? $counting{$key} : 0;
  }

  ### Returns the methylation percentage formatted to one decimal place, or undef if no
  ### cytosine was seen in this context at all (a percentage can't be determined then)
  my $percentage = sub {
    my ($methylated,$unmethylated) = @_;
    return unless ($methylated+$unmethylated) > 0;
    return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
  };

  ### Returns the report line for one context: either the percentage or the note that it could
  ### not be determined. sprintf never produces a false string here (0 is formatted as '0.0'),
  ### so testing for definedness selects the same branch as a plain truth test would
  my $percentage_line = sub {
    my ($context,$percent,$line_end) = @_;
    if (defined $percent){
      return "C methylated in $context context:\t${percent}%".$line_end;
    }
    return "Can't determine percentage of methylated Cs in $context context if value was 0".$line_end;
  };

  ### Calculating methylation percentages if applicable
  my $percent_meCpG = $percentage->($count{total_meCpG_count},$count{total_unmethylated_CpG_count});
  my $percent_meCHG = $percentage->($count{total_meCHG_count},$count{total_unmethylated_CHG_count});
  my $percent_meCHH = $percentage->($count{total_meCHH_count},$count{total_unmethylated_CHH_count});

  my $percent_non_CpG_methylation;
  if ($merge_non_CpG){
    $percent_non_CpG_methylation = $percentage->($count{total_meCHH_count}+$count{total_meCHG_count},
                                                 $count{total_unmethylated_CHH_count}+$count{total_unmethylated_CHG_count});
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $count{total_meCHG_count}+$count{total_meCHH_count}+$count{total_meCpG_count}
                         +$count{total_unmethylated_CHG_count}+$count{total_unmethylated_CHH_count}+$count{total_unmethylated_CpG_count};

  my @summary = (
                 "Final Cytosine Methylation Report\n",'='x33,"\n",
                 "Total number of C's analysed:\t$total_number_of_C\n\n",

                 "Total methylated C's in CpG context:\t$count{total_meCpG_count}\n",
                 "Total methylated C's in CHG context:\t$count{total_meCHG_count}\n",
                 "Total methylated C's in CHH context:\t$count{total_meCHH_count}\n\n",

                 "Total C to T conversions in CpG context:\t$count{total_unmethylated_CpG_count}\n",
                 "Total C to T conversions in CHG context:\t$count{total_unmethylated_CHG_count}\n",
                 "Total C to T conversions in CHH context:\t$count{total_unmethylated_CHH_count}\n\n",

                 $percentage_line->('CpG',$percent_meCpG,"\n"),
                );

  ### 2-Context Output
  if ($merge_non_CpG){
    push @summary, $percentage_line->('non-CpG',$percent_non_CpG_methylation,"\n\n\n");
  }
  ### 3-Context Output
  else{
    push @summary, $percentage_line->('CHG',$percent_meCHG,"\n");
    push @summary, $percentage_line->('CHH',$percent_meCHH,"\n\n\n");
  }

  ### The splitting report file and the on-screen report carry exactly the same text, so it is
  ### assembled once and written to both destinations
  my $summary_text = join ('',@summary);

  if ($report){
    print REPORT $summary_text;
  }
  print $summary_text;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1535
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1536
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1537
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1538
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1539
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1540 sub print_individual_C_methylation_states_paired_end_files{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1541
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1542 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1543 my @methylation_calls = split(//,$meth_call);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1544
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1545 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1546 ### . for bases not involving cytosines ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1547 ### X for methylated C in CHG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1548 ### x for not methylated C in CHG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1549 ### H for methylated C in CHH context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1550 ### h for not methylated C in CHH context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1551 ### Z for methylated C in CpG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1552 ### z for not methylated C in CpG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1553 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1554
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1555 my $methyl_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1556 my $methyl_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1557 my $methyl_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1558 my $unmethylated_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1559 my $unmethylated_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1560 my $unmethylated_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1561
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1562 my @len;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1563 my @ops;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1564 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1565 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1566 my @comp_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1567
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1568 if ($cigar){ # parsing CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1569 @len = split (/\D+/,$cigar); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1570 @ops = split (/\d+/,$cigar); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1571 shift @ops; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1572 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1573
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1574 foreach my $index (0..$#len){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1575 foreach (1..$len[$index]){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1576 # print "$ops[$index]";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1577 push @comp_cigar, $ops[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1578 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1579 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1580 # warn "\nDetected CIGAR string: $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1581 # warn "Length of methylation call: ",length $meth_call,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1582 # warn "number of operations: ",scalar @ops,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1583 # warn "number of length digits: ",scalar @len,"\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1584 # print @comp_cigar,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1585 # print "$meth_call\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1586 # sleep (1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1587 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1588
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1589
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1590 if ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1591
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1592 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1593 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1594 # print "reverse CIGAR string: @comp_cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1595
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1596 ### the start position of paired-end files has already been corrected, see above
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1597 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1598
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1599 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1600
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1601 if ($merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1602
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1603 if ($no_overlap) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1604
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1605 ### single-file CpG and non-CpG context output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1606 if ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1607 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1608 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1609
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1610 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1611 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1612 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1613 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1614 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1615 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1616
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1617 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1618 if ($start+$index+$pos_offset >= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1619 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1620 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1621
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1622 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1623 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1624 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1625 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1626 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1627 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1628 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1629 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1630 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1631 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1632 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1633 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1634 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1635 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1636 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1637 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1638 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1639 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1640 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1641 elsif ($methylation_calls[$index] eq '.'){}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1642 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1643 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1644 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1645 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1646 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1647 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1648
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1649 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1650 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1651 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1652 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1653 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1654 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1655
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1656 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1657 if ($start-$index+$pos_offset <= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1658 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1659 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1660
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1661 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1662 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1663 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1664 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1665 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1666 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1667 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1668 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1669 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1670 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1671 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1672 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1673 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1674 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1675 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1676 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1677 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1679 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1680 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1681 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1682 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1683 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1684 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1685 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1686 die "The read orientation was neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1687 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1688 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1689
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1690 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1691 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1692 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1693 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1694
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1695 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1696 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1697 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1698 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1699 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1700 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1701
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1702 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1703 if ($start+$index+$pos_offset >= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1704 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1705 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1706
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1707 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1708 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1709 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1710 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1711 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1712 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1713 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1714 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1716 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1717 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1719 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1720 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1721 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1722 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1723 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1724 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1725 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1726 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1727 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1729 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1730 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1731 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1732 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1733
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1734 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1735 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1736 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1737 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1738 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1739 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1740
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1741 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1742 if ($start-$index+$pos_offset <= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1743 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1744 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1745
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1746 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1747 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1748 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1749 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1750 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1751 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1752 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1753 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1754 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1755 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1756 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1757 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1758 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1759 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1760 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1761 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1762 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1763 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1764 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1765 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1766 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1768 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1769 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1770 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1771 die "The strand orientation was neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1772 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1773 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1774 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1775
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1776 ### this is the default paired-end procedure allowing overlaps and using every single C position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1777 ### Still within the 2-CONTEXT ONLY optional section
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1778 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1779 ### single-file CpG and non-CpG context output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1780 if ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1781 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1782 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1783
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1784 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1785 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1786 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1787 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1788 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1789 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1790
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1791 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1792 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1793 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1794 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1795 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1796 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1797 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1798 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1799 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1800 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1801 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1802 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1803 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1804 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1805 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1806 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1807 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1808 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1809 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1810 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1811 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1812 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1813 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1814 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1815 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1816 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1817
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1818 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1819 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1820 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1821 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1822 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1823 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1824
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1825 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1826 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1827 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1828 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1829 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1830 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1831 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1832 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1833 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1834 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1835 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1836 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1837 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1838 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1839 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1840 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1841 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1842 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1843 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1844 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1845 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1846 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1847 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1848 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1849 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1850 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1851 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1852 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1853
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1854 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1855 ### still within the 2-CONTEXT optional section
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1856 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1857 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1858 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1859
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1860 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1861 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1863 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1864 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1865 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1866
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1867 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1868 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1869 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1870 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1871 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1872 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1873 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1874 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1875 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1876 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1877 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1878 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1879 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1880 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1881 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1882 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1883 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1884 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1885 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1886 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1887 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1888 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1889 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1890 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1891 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1892 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1893
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1894 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1895 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1896 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1897 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1898 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1899 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1900
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1901 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1902 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1903 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1904 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1905 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1906 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1907 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1908 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1909 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1910 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1911 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1912 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1913 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1914 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1915 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1916 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1917 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1918 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1919 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1920 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1921 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1922 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1923 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1924 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1925 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1926 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1927 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1928 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1929 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1930 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1931
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1932 ############################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1933 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1934 ############################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1935
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1936 elsif ($no_overlap) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1937 ### single-file CpG, CHG and CHH context output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1938 if ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1939 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1940 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1941
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1942 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1945 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1946 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1947 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1948
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1949 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1950 if ($start+$index+$pos_offset >= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1951 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1952 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1953
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1954 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1955 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1956 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1957 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1958 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1959 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1960 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1961 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1962 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1963 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1964 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1965 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1966 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1967 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1968 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1969 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1970 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1971 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1972 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1973 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1974 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1975 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1976 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1977 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1978 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1979 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1980
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1981 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1982 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1983 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1984 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1985 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1986 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1987
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1988 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1989 if ($start-$index+$pos_offset <= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1990 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1991 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1992
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1993 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1994 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1995 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1996 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1997 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1998 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
1999 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2000 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2001 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2002 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2003 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2004 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2005 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2006 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2007 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2008 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2009 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2010 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2011 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2012 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2013 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2014 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2015 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2016 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2017 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2018 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2019 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2020 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2021
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2022 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2023 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2024 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2025 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2026
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2027 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2028 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2030 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2031 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2032 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2033
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2034 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2035 if ($start+$index+$pos_offset >= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2036 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2037 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2038
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2039 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2040 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2041 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2042 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2043 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2044 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2045 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2046 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2047 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2048 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2049 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2050 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2051 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2052 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2053 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2054 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2055 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2056 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2057 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2058 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2059 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2060 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2061 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2062 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2063 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2064 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2065
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2066 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2067 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2068 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2069 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2070 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2071 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2072
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2073 ### Returning as soon as the methylation calls start overlapping
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2074 if ($start-$index+$pos_offset <= $end_read_1) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2075 return;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2076 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2077
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2078 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2079 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2080 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2081 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2082 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2083 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2084 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2085 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2086 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2087 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2088 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2089 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2090 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2091 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2092 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2093 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2094 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2095 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2096 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2097 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2098 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2100 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2101 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2102 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2103 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2104 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2105 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2106 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2107
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2108 ### this is the default paired-end procedure allowing overlaps and using every single C position
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2109 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2110 ### single-file CpG, CHG and CHH context output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2111 if ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2112 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2113 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2114
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2115 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2116 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2117 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2118 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2119 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2120 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2121
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2122 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2123 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2124 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2125 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2126 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2127 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2128 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2129 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2130 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2131 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2132 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2133 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2134 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2135 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2136 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2137 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2138 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2139 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2140 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2141 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2142 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2143 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2144 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2145 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2146 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2147 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2148
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2149 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2150 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2151 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2152 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2153 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2154 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2155
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2156 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2157 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2158 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2159 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2160 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2161 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2162 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2163 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2164 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2165 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2166 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2167 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2168 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2169 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2170 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2171 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2172 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2173 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2174 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2175 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2176 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2178 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2179 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2180 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2181 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2182 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2183 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2184
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2185 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2186 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2187 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2188 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2189
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2190 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2191 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2192 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2193 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2194 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2195 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2196
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2197 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2198 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2199 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2200 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2201 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2202 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2203 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2204 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2205 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2206 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2207 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2208 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2209 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2210 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2211 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2212 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2213 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2214 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2215 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2216 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2217 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2218 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2219 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2220 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2221 } elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2222 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2223
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2224 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2225 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2226 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2227 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2228 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2229 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2230
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2231 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2232 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2233 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2234 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2235 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2236 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2237 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2238 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2239 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2240 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2241 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2242 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2243 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2244 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2245 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2246 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2247 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2248 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2249 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2250 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2251 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2252 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2253 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2254 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2255 } else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2256 die "The strand orientation as neither + nor -: '$strand'\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2257 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2258 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2259 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2260 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2261
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Works out how the current read base relates to the genome according to the composite CIGAR array.
###
### Arguments (positional):
###   $index        - index of the current base within the methylation call string
###   $cigar_offset - number of composite CIGAR elements the caller has skipped so far; the element
###                   inspected first is the one at $index + $cigar_offset
###   $pos_offset   - accepted so existing callers keep working, but not read by this subroutine
###   $strand       - '+' or '-'; any other value results in no adjustment at all
###   $comp_cigar   - reference to an array holding one CIGAR operation letter per element
###
### Returns the list ($new_cigar_offset,$new_pos_offset). These are the amounts the caller has to ADD to
### its running CIGAR offset and position offset; they are increments, not new absolute values.
###
### Dies if the composite CIGAR array has no element at the position that needs to be inspected, or if
### an operation other than 'M', 'I' or 'D' is encountered.
sub check_cigar_string {
  my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
  # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
  my ($new_cigar_offset,$new_pos_offset) = (0,0);

  ### Insertions and deletions move the genomic position in opposite directions, and both directions
  ### are mirrored for reads on the reverse strand:
  ###   + strand: insertion -1 bp, deletion +1 bp
  ###   - strand: insertion +1 bp, deletion -1 bp
  my ($insertion_shift,$deletion_shift);
  if ($strand eq '+') {
    ($insertion_shift,$deletion_shift) = (-1,1);
  }
  elsif ($strand eq '-') {
    ($insertion_shift,$deletion_shift) = (1,-1);
  }
  else{
    ### any other strand value leaves both offsets untouched
    return ($new_cigar_offset,$new_pos_offset);
  }

  my $operation = $comp_cigar->[$index + $cigar_offset];

  ### Without this check a position beyond the end of the array would be compared as undef, which
  ### produces 'uninitialized value' warnings followed by a misleading 'undefined operations' error
  unless (defined $operation){
    die "The CIGAR string has no operation at position ".($index + $cigar_offset)." (it describes only ".scalar(@$comp_cigar)." positions)\n";
  }

  if ($operation eq 'M'){ # sequence position matches the genomic position
    # warn "position needs no adjustment\n";
  }
  elsif ($operation eq 'I'){ # insertion in the read sequence
    $new_pos_offset += $insertion_shift;
    # warn "adjusted genomic position by 1 bp (insertion)\n";
  }
  elsif ($operation eq 'D'){ # deletion in the read sequence
    ### Deleted bases have no counterpart in the methylation call string, so the whole run of 'D'
    ### elements is consumed here until an 'M' or 'I' (or the end of the array) is reached
    while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
      $operation = $comp_cigar->[$index + $cigar_offset + $new_cigar_offset];

      if ($operation eq 'D'){
        $new_cigar_offset += 1;             # the composite CIGAR array no longer lines up with the methylation call index
        $new_pos_offset += $deletion_shift; # every deleted base shifts the genomic position by 1 bp
      }
      elsif ($operation eq 'M'){ # back on a matching position, nothing else to adjust
        last;
      }
      elsif ($operation eq 'I'){ # the deletion is directly followed by an insertion
        $new_pos_offset += $insertion_shift;
        last;
      }
      else{
        die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
      }
    }
  }
  else{
    die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
  }

  # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
  return ($new_cigar_offset,$new_pos_offset);
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2353
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2354 sub print_individual_C_methylation_states_single_end{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2355
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2356 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2357 my @methylation_calls = split(//,$meth_call);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2358
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2359 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2360 ### . for bases not involving cytosines ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2361 ### X for methylated C in CHG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2362 ### x for not methylated C in CHG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2363 ### H for methylated C in CHH context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2364 ### h for not methylated C in CHH context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2365 ### Z for methylated C in CpG context (was protected) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2366 ### z for not methylated C in CpG context (was converted) ###
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2367 #################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2368
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2369 my $methyl_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2370 my $methyl_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2371 my $methyl_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2372 my $unmethylated_CHG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2373 my $unmethylated_CHH_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2374 my $unmethylated_CpG_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2375
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2376 my @len;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2377 my @ops;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2378 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2379 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2380
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2381 my @comp_cigar;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2382
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2383 if ($cigar){ # parsing CIGAR string
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2384 @len = split (/\D+/,$cigar); # storing the length per operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2385 @ops = split (/\d+/,$cigar); # storing the operation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2386 shift @ops; # remove the empty first element
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2387 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2388
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2389 foreach my $index (0..$#len){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2390 foreach (1..$len[$index]){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2391 # print "$ops[$index]";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2392 push @comp_cigar, $ops[$index];
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2393 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2394 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2395 # warn "\nDetected CIGAR string: $cigar\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2396 # warn "Length of methylation call: ",length $meth_call,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2397 # warn "number of operations: ",scalar @ops,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2398 # warn "number of length digits: ",scalar @len,"\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2399 # print @comp_cigar,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2400 # print "$meth_call\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2401 # sleep (1);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2402 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2403
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2404 ### adjusting the start position for all reads mapping to the reverse strand
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2405 if ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2406
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2407 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2408 # print @comp_cigar,"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2409
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2410 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2411
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2412 if ($cigar){ ### SAM format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2413 my $MD_count = 0;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2414 foreach (@comp_cigar){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2415 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2416 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2417 $start += $MD_count - 1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2418 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2419 else{ ### vanilla format
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2420 $start += length($meth_call)-1;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2421 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2422 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2423 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2424
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2425 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2426
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2427 ### single-file CpG and other-context output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2428 if ($full and $merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2429 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2430 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2431
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2432 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2433 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2434 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2435 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2436 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2437 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2438
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2439 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2440 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2441 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2442 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2443 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2444 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2445 elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2446 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2447 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2448 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2449 elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2450 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2451 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2452 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2453 elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2454 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2455 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2456 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2457 elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2458 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2459 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2460 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2461 elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2462 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2463 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2464 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2465 elsif ($methylation_calls[$index] eq '.') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2466 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2467 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2468 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2469 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2470 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2471 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2472 elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2473
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2474 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2475 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2476 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2477
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2478 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2479 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2480 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2481 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2482 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2483 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2484
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2485 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2486 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2487 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2488 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2489 elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2490 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2491 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2492 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2493 elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2494 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2495 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2496 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2497 elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2498 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2499 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2500 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2501 elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2502 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2503 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2504 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2505 elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2506 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2508 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2509 elsif ($methylation_calls[$index] eq '.'){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2510 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2511 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2512 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2513 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2514 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2515 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2516 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2517 die "The strand information was neither + nor -: $strand\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2518 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2519 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2520
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2521 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2522 elsif ($merge_non_CpG) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2523 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2524 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2525 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2526 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2527
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2528 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2529 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2530 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2531 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2532 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2533
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2534 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2535 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2536 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2537 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2538 elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2539 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2541 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2542 elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2543 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2544 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2545 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2546 elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2547 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2548 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2549 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2550 elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2551 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2552 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2553 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2554 elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2555 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2556 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2557 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2558 elsif ($methylation_calls[$index] eq '.') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2559 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2560 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2561 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2562 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2563 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2564 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2565 elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2566
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2567 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2568 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2569 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2570
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2571 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2572 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2573 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2574 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2575 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2576
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2577 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2578 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2579 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2580 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2581 elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2582 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2584 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2585 elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2586 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2587 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2588 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2589 elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2590 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2591 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2592 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2593 elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2594 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2596 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2597 elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2598 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2599 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2600 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2601 elsif ($methylation_calls[$index] eq '.') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2602 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2603 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2604 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2605 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2606 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2607 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2608 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2609 die "The strand information was neither + nor -: $strand\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2610 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2611 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2612
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2613 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2614
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2615 elsif ($full) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2616 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2617 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2618 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2619 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2620
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2621 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2622 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2623 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2624 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2625 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2626
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2627 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2628 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2629 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2630 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2631 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2632 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2633 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2634 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2635 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2636 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2637 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2638 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2639 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2640 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2641 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2642 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2643 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2644 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2645 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2646 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2647 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2648 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2649 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2650 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2651 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2652 elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2653
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2654 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2655 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2656 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2657
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2658 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2659 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2660 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2661 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2662 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2663
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2664 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2665 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2666 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2667 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2668 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2669 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2670 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2671 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2672 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2673 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2674 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2675 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2676 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2677 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2678 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2679 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2680 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2681 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2682 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2683 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2684 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2685 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2686 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2687 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2688 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2689 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2690 die "The read had a strand orientation which was neither + nor -: $strand\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2691 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2692 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2693
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2694 ### strand-specific methylation output
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2695 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2696 if ($strand eq '+') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2697 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2698 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2699 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2700
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2701 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2702 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2703 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2704 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2705 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2706
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2707 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2708 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2709 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2710 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2711 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2712 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2713 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2714 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2716 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2717 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2719 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2720 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2721 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2722 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2723 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2724 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2725 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2726 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2727 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2729 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2730 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2731 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2732 elsif ($strand eq '-') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2733
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2734 for my $index (0..$#methylation_calls) {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2735 ### methylated Cs (any context) will receive a forward (+) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2736 ### not methylated Cs (any context) will receive a reverse (-) orientation
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2737
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2738 if ($cigar){ # only needed for SAM files
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2739 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2740 $cigar_offset += $cigar_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2741 $pos_offset += $pos_mod;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2742 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2743
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2744 if ($methylation_calls[$index] eq 'X') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2745 $counting{total_meCHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2746 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2747 } elsif ($methylation_calls[$index] eq 'x') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2748 $counting{total_unmethylated_CHG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2749 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2750 } elsif ($methylation_calls[$index] eq 'Z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2751 $counting{total_meCpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2752 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2753 } elsif ($methylation_calls[$index] eq 'z') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2754 $counting{total_unmethylated_CpG_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2755 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2756 } elsif ($methylation_calls[$index] eq 'H') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2757 $counting{total_meCHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2758 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2759 } elsif ($methylation_calls[$index] eq 'h') {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2760 $counting{total_unmethylated_CHH_count}++;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2761 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2762 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2763 elsif ($methylation_calls[$index] eq '.') {}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2764 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2765 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2766 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2767 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2768 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2769 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2770 die "The strand information was neither + nor -: $strand\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2771 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2772 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2773 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2774
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2775
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2776
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2777 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2778 ### bismark2bedGraph section - START
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2779 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2780
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### process_bedGraph_output
###
### Converts the methylation extractor output files into bedGraph records.
### Takes no arguments; it works on file-level state instead:
###   reads   %fhs, @sorting_files, $CX_context, $output_dir, $remove, $no_header
###   updates @bedfiles (files selected for conversion), @methylcalls (per-position counters),
###           and strips any leading directory from the entries of @sorting_files in place
###   calls   validate_methylation_call() and generate_output()
###
### Steps:
###   1. close every extractor output filehandle held in %fhs
###   2. select the files to convert (all contexts with $CX_context, otherwise only CpG_* files)
###   3. chdir into $output_dir (if set) and, with $remove, write copies of the selected files
###      in which whitespace in the read ID has been replaced by underscores
###   4. split the selected files into one temporary file per chromosome
###   5. sort each temporary file by chromosome/position, accumulate the calls per position,
###      hand each finished position to generate_output(), and delete the temporary file
###
### Dies if a file cannot be opened/closed, if the directory change fails, or if sorting fails.
sub process_bedGraph_output{
  warn "="x64,"\n";
  warn "Methylation information will now be written into a bedGraph file\n";
  warn "="x64,"\n\n";
  sleep (2);

  ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
  foreach my $fh (keys %fhs) {
    if ($fh =~ /^[1230]$/) {
      ### keys 0-3 hold a hash of context => filehandle
      foreach my $context (keys %{$fhs{$fh}}) {
        close $fhs{$fh}->{$context} or die $!;
      }
    } else {
      ### any other key holds the filehandle directly
      close $fhs{$fh} or die $!;
    }
  }

  ### deciding which files to use for bedGraph conversion
  ### ($filename aliases the elements of @sorting_files, so the path stripping below is applied in place)
  foreach my $filename (@sorting_files){
    if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
      $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
    }

    if ($CX_context){
      push @bedfiles,$filename;
    }
    elsif ($filename =~ /^CpG_/){ ## CpG context only (default); CHH or CHG files are skipped
      push @bedfiles,$filename;
    }
  }

  warn "Using the following files as Input:\n";
  print join ("\t",@bedfiles),"\n\n";
  sleep (2);

  my %temp_fhs;   # chromosome name => filehandle of its temporary file
  my @temp_files; # writing all context files (default CpG only) to these files prior to sorting

  ### changing to the output directory
  unless ($output_dir eq ''){ # default
    chdir $output_dir or die "Failed to change directory to $output_dir\n";
    warn "Changed directory to $output_dir\n";
  }

  ### $infile aliases the elements of @bedfiles; with $remove the entry is replaced by the cleaned-up file name
  foreach my $infile (@bedfiles) {

    if ($remove) {
      warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";

      open (my $read_fh,'<',$infile) or die $!;

      my $removed_spaces_outfile = $infile.'.spaces_removed.txt';

      ### We have already changed into $output_dir above, so the file name must NOT be prefixed with
      ### $output_dir again: with a relative output directory that would point to a non-existent
      ### nested folder, and the file is read back below using its plain name.
      open (my $rem_fh,'>',$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";

      unless ($no_header){
        my $header = <$read_fh>; ### Bismark version header
        print {$rem_fh} $header if defined $header; # an empty input file has no header line
      }

      while (my $line = <$read_fh>) {
        chomp $line;
        my ($id,$strand,$chr,$pos,$context) = split (/\t/,$line);
        $id =~ s/\s+/_/g;
        print {$rem_fh} join ("\t",$id,$strand,$chr,$pos,$context),"\n";
      }

      close $read_fh or die $!;
      close $rem_fh or die $!;

      ### changing the infile name to the new file without spaces
      $infile = $removed_spaces_outfile;
    }

    warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
    open (my $in_fh,'<',$infile) or die $!;

    ## always ignoring the version header
    unless ($no_header){
      my $header = <$in_fh>; ### Bismark version header (discarded)
    }

    while (my $line = <$in_fh>) {
      chomp $line;
      my ($chr) = (split (/\t/,$line))[2];

      ### one temporary file per chromosome, opened on first use and shared by all input files
      unless (exists $temp_fhs{$chr}) {
        open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
      }
      print {$temp_fhs{$chr}} "$line\n";
    }

    close $in_fh or die "Failed to close $infile: $!\n";

    warn "Finished writing out individual chromosome files for $infile\n";
  }
  warn "\n";

  ### The temporary files are read by an external sort process below. Their filehandles have to be
  ### closed first, otherwise data still sitting in Perl's write buffers would be missing from the
  ### sorted input.
  foreach my $chr (keys %temp_fhs) {
    close $temp_fhs{$chr} or die "Failed to close temporary file for chromosome $chr: $!\n";
  }
  %temp_fhs = ();

  @temp_files = glob ('*.meth_extractor.temp');

  warn "Collecting temporary chromosome file information...\n";
  sleep (1);
  warn "processing the following input file(s):\n";
  warn join ("\n",@temp_files),"\n\n";
  sleep (1);

  foreach my $in (@temp_files) {
    warn "Sorting input file $in by positions\n";

    ### list-form pipe open: the file name is handed to sort directly and never passes through a
    ### shell, so chromosome names containing shell metacharacters (|, ;, spaces, ...) are safe
    open (my $ifh,'-|','sort','-k3,3','-k4,4n',$in) or die "Input file could not be sorted. $!";

    ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {

    my $name;
    my $meth_state;
    my $chr = "";
    my $pos = 0;
    my $meth_state2;

    my $last_pos;
    my $last_chr;

    ############################################# }

    while (my $line = <$ifh>) {
      next if $line =~ /^Bismark/;
      chomp $line;

      ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {

      $last_chr = $chr;
      $last_pos = $pos;
      ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;

      ### a new position has started: write out the counts collected for the previous one
      if (($last_pos ne $pos) || ($last_chr ne $chr)) {
        generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
        @methylcalls = qw (0 0 0);
      }

      ############################################# }

      my $validated = validate_methylation_call($meth_state, $meth_state2);
      unless($validated){
        warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
        next;
      }

      ### @methylcalls = (methylated, unmethylated, total) for the current position
      if ($meth_state eq "+") {
        $methylcalls[0]++;
        $methylcalls[2]++;
      } else {
        $methylcalls[1]++;
        $methylcalls[2]++;
      }
    }

    ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {

    $last_chr = $chr;
    $last_pos = $pos;
    generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;

    ############################################# }

    ### closing a pipe also reaps the sort process; $! is empty when sort merely exited non-zero
    close $ifh or die "Sorting of input file $in failed: ".($! ? $! : 'sort exited with status '.($? >> 8))."\n";

    @methylcalls = qw (0 0 0); # resetting @methylcalls

    ### deleting temporary files
    my $delete = unlink $in;
    if ($delete) {
      warn "Successfully deleted the temporary input file $in\n\n";
    }
    else {
      warn "The temporary inputfile $in could not be deleted $!\n\n";
    }
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2965
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### generate_output ($chromosome, $position)
###
### Writes one bedGraph record for a single cytosine position to the OUT filehandle.
### The call counts are taken from the file-level @methylcalls array
### (index 0 = methylated, 1 = unmethylated, 2 = total), not from the arguments.
###
### Nothing is printed when the total count is below $coverage_threshold.
### With $counts set, the methylated and unmethylated counts are appended as two extra columns.
###
### Croaks if the total count is not positive or does not equal methylated + unmethylated.
sub generate_output{
  my ($chromosome, $position) = @_;
  my ($methylated, $unmethylated, $total) = @methylcalls[0..2];

  unless ($total > 0) {
    croak "Should not be generating output if there's no reads to this region";
  }
  if ($total != ($methylated + $unmethylated)) {
    croak "Total counts ($total) is not the sum of the methylated ($methylated) and unmethylated ($unmethylated) counts";
  }

  ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. A separate variable
  ### is used so that the Bismark position itself is never modified.
  ### NOTE(review): start and end column are both written as the same 0-based value; the UCSC
  ### bedGraph description treats chromEnd as exclusive - confirm what downstream consumers expect.
  my $bed_pos = $position - 1;

  ### positions covered by fewer calls than the threshold are silently left out
  if ($total >= $coverage_threshold) {
    my $percentage = ($methylated / $total) * 100;

    my @columns = ($chromosome, $bed_pos, $bed_pos, $percentage);
    push @columns, $methylated, $unmethylated if $counts;

    print OUT join ("\t", @columns), "\n";
  }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
2991
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub validate_methylation_call{
    # Checks that a (+/-) methylation state agrees with the alphabetical
    # methylation call that accompanies it.
    #
    # Arguments (positional):
    #   1. methylation state, '+' or '-'
    #   2. alphabetical methylation call (a single letter)
    #
    # Returns 1 if the pair is consistent, 0 otherwise.
    # Croaks if either argument is undefined.
    my ($meth_state, $meth_state2) = @_;

    croak "Missing (+/-) methylation call" unless defined $meth_state;
    croak "Missing alphabetical methylation call" unless defined $meth_state2;

    # Calls starting with z/Z are checked as CpG calls, everything else is
    # handed to the non-CpG check.
    my $is_consistent;
    if ($meth_state2 =~ /^z/i){
        $is_consistent = check_CpG_methylation_call($meth_state, $meth_state2);
    }
    else{
        $is_consistent = check_nonCpG_methylation_call($meth_state, $meth_state2);
    }

    return $is_consistent ? 1 : 0;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3003
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub check_CpG_methylation_call{
    # Returns 1 if the (+/-) state matches the case of the CpG call letter
    # ('+' with 'Z', '-' with 'z'), and 0 for every other combination.
    my ($meth1, $meth2) = @_;

    # the only letter that is acceptable for each methylation state
    my %expected_letter = (
        '+' => 'Z',
        '-' => 'z',
    );

    return 0 unless exists $expected_letter{$meth1};
    return ($expected_letter{$meth1} eq $meth2) ? 1 : 0;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3011
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub check_nonCpG_methylation_call{
    # Returns 1 if the (+/-) state matches the case of a non-CpG call letter
    # ('+' with C, X or H; '-' with c, x or h), and 0 for every other
    # combination.
    my ($meth1, $meth2) = @_;

    # letters that are acceptable for each methylation state
    my %allowed_letters = (
        '+' => { 'C' => 1, 'X' => 1, 'H' => 1 },
        '-' => { 'c' => 1, 'x' => 1, 'h' => 1 },
    );

    return 0 unless exists $allowed_letters{$meth1};
    return (exists $allowed_letters{$meth1}->{$meth2}) ? 1 : 0;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3023
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3024 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3025 ### bismark2bedGraph section - END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3026 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3027
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3028
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3029
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3030
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3031
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3032
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3033 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3034 ### genome-wide cytosine methylation report - START
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3035 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3036
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
sub generate_genome_wide_cytosine_report {
    # Writes a genome-wide cytosine methylation report.
    #
    # Arguments (positional):
    #   1. $in           - tab-separated input file; columns 1, 2, 5 and 6 are read
    #                      as chromosome, 0-based position, methylated count and
    #                      unmethylated count. Lines of one chromosome have to be
    #                      adjacent, because positions are collected for a single
    #                      chromosome at a time.
    #   2. $cytosine_out - output file name prefix; 'genome-wide_CX_report.txt'
    #                      (if $CX_context is set) or 'genome_wide_CpG_report.txt'
    #                      is appended to it.
    #
    # For every chromosome found in $in, each C (forward strand) and G (reverse
    # strand) of the sequence stored in %chromosomes is reported with its counts
    # (0/0 if the position does not occur in $in). With $split_by_chromosome one
    # output file per chromosome is written, otherwise a single file.
    #
    # Uses the file-level settings $output_dir, $CX_context, $split_by_chromosome,
    # $CpG_only and $zero, the %chromosomes hash and the IN / CYT filehandles.
    # Dies if the directory change or opening/closing a file fails.
    # Returns 1.
    my ($in, $cytosine_out) = @_;

    warn "="x78,"\n";
    warn "Methylation information will now be written into a genome-wide cytosine report\n";
    warn "="x78,"\n\n";
    sleep (2);

    ### changing to the output directory again
    unless ($output_dir eq ''){ # default
        chdir $output_dir or die "Failed to change directory to $output_dir\n";
    }

    # three-argument open: the file name can never be mistaken for a mode or a pipe
    open (IN,'<',$in) or die "Failed to read from input file $in: $!\n";

    if ($CX_context){
        $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
    }
    else{
        $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
    }

    ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
    unless ($split_by_chromosome){ ### writing all output to a single file (default)
        open (CYT,'>',$cytosine_out) or die $!;
        warn "Writing genome-wide cytosine report to: $cytosine_out\n";
        sleep (3);
    }

    my $last_chr;
    my %chr; # storing covered positions for one chromosome at a time

    while (my $line = <IN>){
        chomp $line;
        next unless length $line; # an empty line carries no position

        my ($chr,$start,undef,undef,$meth,$nonmeth) = split /\t/, $line;

        # a new chromosome starts: write out the previous one and forget its positions
        if (defined $last_chr and $chr ne $last_chr){
            _write_cytosine_report_for_chromosome($last_chr,$chr{$last_chr},$cytosine_out);
            %chr = (); # resetting the hash
        }

        $last_chr = $chr;
        $chr{$chr}->{$start}->{meth}    = $meth;
        $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
    }
    close IN or warn "Failed to close input file $in: $!\n";

    if (defined $last_chr){
        # Last found chromosome
        _write_cytosine_report_for_chromosome($last_chr,$chr{$last_chr},$cytosine_out);
    }
    else{
        warn "No covered positions were found in $in, no cytosine positions will be reported\n";
    }

    # per-chromosome files are closed by the helper right after they were written
    unless ($split_by_chromosome){
        close CYT or die $!;
    }
    return 1;
}

sub _write_cytosine_report_for_chromosome {
    # Private helper of generate_genome_wide_cytosine_report: writes the report
    # lines of one chromosome to the CYT filehandle.
    #
    # Arguments (positional):
    #   1. $chr_name     - chromosome name (key into %chromosomes)
    #   2. $covered      - hash reference: 0-based position => { meth, nonmeth }
    #   3. $cytosine_out - name of the report file; with $split_by_chromosome the
    #                      trailing 'txt' is replaced by 'chr<name>.txt' and that
    #                      file is opened, written and closed here
    #
    # Each output line holds, tab-separated: chromosome, position (1-based, or
    # 0-based if $zero is set), strand, methylated count, unmethylated count,
    # context (CG, CHG or CHH) and the trinucleotide sequence.
    my ($chr_name, $covered, $cytosine_out) = @_;

    warn "Writing cytosine reports for chromosome $chr_name (stored ",scalar keys %{$covered}," different covered positions)\n";

    if ($split_by_chromosome){ ## writing output to 1 file per chromosome
        my $chromosome_out = $cytosine_out;
        $chromosome_out =~ s/txt$/chr${chr_name}.txt/;
        open (CYT,'>',$chromosome_out) or die $!;
    }

    if (defined $chromosomes{$chr_name}){
        # working through a reference avoids copying the whole chromosome sequence
        my $sequence = \$chromosomes{$chr_name};

      POSITION:
        while ( ${$sequence} =~ /([CG])/g){
            my $base = $1;
            my $pos  = pos(${$sequence}); # 1-based position of the matched base

            my $tri_nt;
            my $strand;

            if ($base eq 'C'){ # C on forward strand
                $tri_nt = substr (${$sequence},($pos-1),3); # positions are 0-based!
                $strand = '+';
            }
            else{ # G, i.e. C on reverse strand
                # the first two bases of a chromosome have no complete upstream
                # trinucleotide; skip them explicitly instead of relying on
                # substr() with a negative offset returning a short string
                next POSITION if ($pos < 3);
                $tri_nt = substr (${$sequence},($pos-3),3); # positions are 0-based!
                $tri_nt = reverse $tri_nt;
                $tri_nt =~ tr/ACTG/TGAC/;
                $strand = '-';
            }
            next POSITION if (length($tri_nt) < 3); # trinucleotide sequence could not be extracted

            ### determining cytosine context
            my $context;
            if ($tri_nt =~ /^CG/){
                $context = 'CG';
            }
            elsif ($tri_nt =~ /^C.{1}G$/){
                $context = 'CHG';
            }
            elsif ($tri_nt =~ /^C.{2}$/){
                $context = 'CHH';
            }
            else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
                warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
                next POSITION;
            }

            # CpG context is the default; all cytosines are only reported with --CX
            next POSITION if ($CpG_only and $context ne 'CG');

            my $meth    = 0;
            my $nonmeth = 0;
            if (exists $covered->{($pos-1)}){ # stored positions are 0-based!
                $meth    = $covered->{$pos-1}->{meth};
                $nonmeth = $covered->{$pos-1}->{nonmeth};
            }

            my $report_pos = $zero ? $pos - 1 : $pos; # zero-based coordinates on request
            print CYT join ("\t",$chr_name,$report_pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
        }
    }
    else{
        warn "No sequence is stored for chromosome $chr_name, no cytosine positions can be reported for it\n";
    }

    if ($split_by_chromosome){
        # closing explicitly so that buffered write errors are not lost
        close CYT or die $!;
    }
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3245
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3246
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3247 sub read_genome_into_memory{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3248
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3249 ## reading in and storing the specified genome in the %chromosomes hash
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3250 chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3251 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3252
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3253 my @chromosome_filenames = <*.fa>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3254
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3255 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3256 unless (@chromosome_filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3257 @chromosome_filenames = <*.fasta>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3258 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3259 unless (@chromosome_filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3260 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3261 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3262
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3263 foreach my $chromosome_filename (@chromosome_filenames){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3264
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3265 # skipping the tophat entire mouse genome fasta file
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3266 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3267
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3268 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3269 ### first line needs to be a fastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3270 my $first_line = <CHR_IN>;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3271 chomp $first_line;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3272 $first_line =~ s/\r//; # removing /r carriage returns
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3273
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3274 ### Extracting chromosome name from the FastA header
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3275 my $chromosome_name = extract_chromosome_name($first_line);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3276
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3277 my $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3278 while (<CHR_IN>){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3279 chomp;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3280 $_ =~ s/\r//; # removing /r carriage returns
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3281
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3282 if ($_ =~ /^>/){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3283 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3284 if (exists $chromosomes{$chromosome_name}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3285 warn "chr $chromosome_name (",length $sequence ," bp)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3286 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3287 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3288 else {
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3289 if (length($sequence) == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3290 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3291 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3292 warn "chr $chromosome_name (",length $sequence ," bp)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3293 $chromosomes{$chromosome_name} = $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3294 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3295 ### resetting the sequence variable
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3296 $sequence = '';
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3297 ### setting new chromosome name
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3298 $chromosome_name = extract_chromosome_name($_);
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3299 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3300 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3301 $sequence .= uc$_;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3302 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3303 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3304
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3305 if (exists $chromosomes{$chromosome_name}){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3306 warn "chr $chromosome_name (",length $sequence ," bp)\t";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3307 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3308 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3309 else{
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3310 if (length($sequence) == 0){
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3311 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3312 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3313 warn "chr $chromosome_name (",length $sequence ," bp)\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3314 $chromosomes{$chromosome_name} = $sequence;
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3315 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3316 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3317 warn "\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3318 chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3319 }
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3320
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Returns the chromosome name encoded in a FastA header line.
###
### Bowtie extracts the first string after the initial '>' in the FASTA file as the
### sequence name, so we are doing this as well.
###
### Argument: a FastA header line (callers strip the line ending beforehand).
### Returns:  the text between the leading '>' and the first whitespace.
### Dies:     if the line does not start with '>', or if no name follows the '>'.
sub extract_chromosome_name {
  my $fasta_header = shift;

  ### reading the first line of an empty sequence file yields undef; treating it as an
  ### empty string produces the regular format error below instead of 'uninitialized value' warnings
  $fasta_header = '' unless (defined $fasta_header);

  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  my ($chromosome_name) = split (/\s+/,$fasta_header);

  ### a bare '>' (or a '>' directly followed by whitespace) does not provide a usable name; returning
  ### undef or '' here would end up as a hash key in the calling code
  unless (defined $chromosome_name and length $chromosome_name){
    die "The FASTA header '>$fasta_header' does not contain a chromosome name directly after the '>'!\n";
  }

  return $chromosome_name;
}
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3332
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3333 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3334 ### genome-wide cytosine methylation report - END
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3335 #######################################################################################################################################
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3336
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3337
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3338
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
3339
183de9d00131 add indices.loc files
bjoern-gruening
parents:
diff changeset
### Prints the help text (description, usage, arguments, options and output formats) of the
### methylation extractor to the currently selected output filehandle (STDOUT by default).
### Takes no arguments. The here-document is single-quoted, so nothing in it is interpolated.
sub print_helpfile{

  print <<'HOW_TO';


DESCRIPTION

The following is a brief description of all options to control the Bismark
methylation extractor. The script reads in a bisulfite read alignment results file
produced by the Bismark bisulfite mapper and extracts the methylation information
for individual cytosines. This information is found in the methylation call field
which can contain the following characters:

       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       ~~~   X   for methylated C in CHG context (was protected)     ~~~
       ~~~   x   for not methylated C in CHG context (was converted) ~~~
       ~~~   H   for methylated C in CHH context (was protected)     ~~~
       ~~~   h   for not methylated C in CHH context (was converted) ~~~
       ~~~   Z   for methylated C in CpG context (was protected)     ~~~
       ~~~   z   for not methylated C in CpG context (was converted) ~~~
       ~~~   .   for any bases not involving cytosines               ~~~
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
context (this distinction is actually already made in Bismark itself). As the methylation
information for every C analysed can produce files which easily have tens or even hundreds of
millions of lines, file sizes can become very large and more difficult to handle. The C
methylation info additionally splits cytosine methylation calls up into one of the four possible
strands a given bisulfite read aligned against:

             OT      original top strand
             CTOT    complementary to original top strand

             OB      original bottom strand
             CTOB    complementary to original bottom strand

Thus, by default twelve individual output files are being generated per input file (unless
--comprehensive is specified, see below). The output files can be imported into a genome
viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
unless the bisulfite reads were generated preserving directionality it doesn't make any
sense to look at the data in a strand-specific manner). Strand-specific output files can
optionally be skipped, in which case only three output files for CpG, CHG or CHH context
will be generated. For both the strand-specific and comprehensive outputs there is also
the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.


The output files are in the following format (tab delimited):

<sequence_id>     <strand>     <chromosome>     <position>     <methylation call>


USAGE: bismark_methylation_extractor [options] <filenames>


ARGUMENTS:

<filenames>              A space-separated list of Bismark result files in SAM format from
                         which methylation information is extracted for every cytosine in
                         the reads. For alignment files in the older custom Bismark output
                         see option '--vanilla'.

OPTIONS:

-s/--single-end          Input file(s) are Bismark result file(s) generated from single-end
                         read data. Specifying either --single-end or --paired-end is
                         mandatory.

-p/--paired-end          Input file(s) are Bismark result file(s) generated from paired-end
                         read data. Specifying either --paired-end or --single-end is
                         mandatory.

--vanilla                The Bismark result input file(s) are in the old custom Bismark format
                         (up to version 0.5.x) and not in SAM format which is the default as
                         of Bismark version 0.6.x or higher. Default: OFF.

--no_overlap             For paired-end reads it is theoretically possible that read_1 and
                         read_2 overlap. This option avoids scoring overlapping methylation
                         calls twice (only methylation calls of read 1 are used in the process
                         since read 1 has historically higher quality basecalls than read 2).
                         Whilst this option removes a bias towards more methylation calls
                         in the center of sequenced fragments it may de facto remove a sizable
                         proportion of the data. This option is highly recommended for paired-end
                         data.

--ignore <int>           Ignore the first <int> bp at the 5' end of each read when processing the
                         methylation call string. This can remove e.g. a restriction enzyme site
                         at the start of each read.

--comprehensive          Specifying this option will merge all four possible strand-specific
                         methylation info into context-dependent output files. The default
                         contexts are:
                          - CpG context
                          - CHG context
                          - CHH context

--merge_non_CpG          This will produce two output files (in --comprehensive mode) or eight
                         strand-specific output files (default) for Cs in
                          - CpG context
                          - non-CpG context

--report                 Prints out a short methylation summary as well as the parameters used to run
                         this script.

--no_header              Suppresses the Bismark version header line in all output files for more convenient
                         batch processing.

-o/--output DIR          Allows specification of a different output directory (absolute or relative
                         path). If not specified explicitly, the output will be written to the current directory.

--version                Displays version information.

-h/--help                Displays this help file and exits.



bedGraph specific options:

--bedGraph               After finishing the methylation extraction, the methylation output is written into a
                         sorted bedGraph file that reports the position of a given cytosine and its methylation
                         state (in %, see details below). The methylation extractor output is temporarily split up into
                         temporary files, one per chromosome (written into the current directory or folder
                         specified with -o/--output); these temp files are then used for sorting and deleted
                         afterwards. By default, only cytosines in CpG context will be sorted. The option
                         '--CX_context' may be used to report all cytosines irrespective of sequence context
                         (this will take MUCH longer!).


--cutoff [threshold]     The minimum number of times a methylation state has to be seen for that nucleotide
                         before its methylation percentage is reported. Default: 1.

--remove_spaces          Replaces whitespaces in the sequence ID field with underscores to allow sorting.


--counts                 Adds two additional columns to the output file to enable further calculations:
                           col 5: number of methylated calls
                           col 6: number of unmethylated calls
                         This option is required if '--cytosine_report' is specified (and will be set automatically if
                         necessary).

--CX/--CX_context        The sorted bedGraph output file contains information on every single cytosine that was covered
                         in the experiment irrespective of its sequence context. This applies to both forward and
                         reverse strands. Please be aware that this option may generate large temporary and output files
                         and may take a long time to sort (up to many hours). Default: OFF.
                         (i.e. Default = CpG context only).



Genome-wide cytosine methylation report specific options:

--cytosine_report        After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
                         genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
                         chromosome coordinates (zero-based coordinates are optional) and reports CpG context only (all
                         cytosine context is optional). The output considers all Cs on both forward and reverse strands and
                         reports their position, strand, trinucleotide context and methylation state (counts are 0 if not
                         covered).

--CX/--CX_context        The output file contains information on every single cytosine in the genome irrespective of
                         its context. This applies to both forward and reverse strands. Please be aware that this will
                         generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
                         Default: OFF (i.e. Default = CpG context only).

--zero_based             Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.

--genome_folder <path>   Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
                         formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.

--split_by_chromosome    Writes the output into individual files for each chromosome instead of a single output file. Files
                         will be named to include the input filename and the chromosome number.



OUTPUT:

The bismark_methylation_extractor output is in the form:
========================================================
<seq-ID>  <methylation state*>  <chromosome>  <start position (= end position)>  <methylation call>

* Methylated cytosines receive a '+' orientation,
* Unmethylated cytosines receive a '-' orientation.



The bedGraph output (optional) looks like this (tab-delimited):
===============================================================
<chromosome>  <start position>  <end position>  <methylation percentage>



The genome-wide cytosine methylation output file is tab-delimited in the following format:
==========================================================================================
<chromosome>  <position>  <strand>  <count methylated>  <count non-methylated>  <C-context>  <trinucleotide context>



This script was last modified on 02 Oct 2012.

HOW_TO
}