Mercurial > repos > bgruening > bismark
comparison bismark_deduplicate/deduplicate_bismark @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 | |
6 | |
7 ### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification | |
8 ### Paired-end alignments are considered a duplicate if both partner reasd start and end at the exact same position | |
9 | |
10 ### May 13, 2013 | |
11 ### Changed the single-end trimming behavior so that only the start coordinate will be used. This avoids duplicate reads that have been trimmed to a varying extent | |
12 ### Changed the way of determining the end of reads in SAM format to using the CIGAR string if the read contains InDels | |
13 | |
14 ### 16 July 2013 | |
15 ### Adding a new deduplication mode for barcoded RRBS-Seq | |
16 | |
17 ### 27 Sept 2013 | |
18 ### Added close statement for all output filehandles (which should probably have been there from the start...) | |
19 | |
20 ### 8 Jan 2015 | |
21 ### to detect paired-end command from the @PG line we are no requiring spaces before and after the -1 or -2 | |
22 | |
23 ### 09 Mar 2015 | |
24 ### Removing newline characters also from the read conversion flag in case the tags had been reordered and are now present in the very last column | |
25 | |
26 ### 19 08 2015 | |
27 ### Hiding the option --representative from view to discourage people from using it (it was nearly always not what they wanted to do anyway). It should still work | |
28 ### for alignments that do not contain any indels | |
29 ### Just for reference, here is the the text: | |
30 ### print "--representative\twill browse through all sequences and print out the sequence with the most representative (as in most frequent) methylation call for any given position. Note that this is very likely the most highly amplified PCR product for a given sequence\n\n"; | |
31 | |
32 | |
33 my $dedup_version = 'v0.16.3'; | |
34 | |
35 my $help; | |
36 my $representative; | |
37 my $single; | |
38 my $paired; | |
39 my $global_single; | |
40 my $global_paired; | |
41 my $vanilla; | |
42 my $samtools_path; | |
43 my $bam; | |
44 my $rrbs; | |
45 my $version; | |
46 | |
47 my $command_line = GetOptions ('help' => \$help, | |
48 'representative' => \$representative, | |
49 's|single' => \$global_single, | |
50 'p|paired' => \$global_paired, | |
51 'vanilla' => \$vanilla, | |
52 'samtools_path=s' => \$samtools_path, | |
53 'bam' => \$bam, | |
54 'barcode' => \$rrbs, | |
55 'version' => \$version, | |
56 ); | |
57 | |
58 die "Please respecify command line options\n\n" unless ($command_line); | |
59 | |
60 if ($help){ | |
61 print_helpfile(); | |
62 exit; | |
63 } | |
64 | |
65 if ($version){ | |
66 print << "VERSION"; | |
67 | |
68 Bismark Deduplication Module | |
69 | |
70 Deduplicator Version: $dedup_version | |
71 Copyright 2010-16 Felix Krueger, Babraham Bioinformatics | |
72 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
73 | |
74 | |
75 VERSION | |
76 exit; | |
77 } | |
78 | |
79 | |
80 | |
81 my @filenames = @ARGV; | |
82 | |
83 unless (@filenames){ | |
84 print "Please provide one or more Bismark output files for deduplication\n\n"; | |
85 sleep (2); | |
86 print_helpfile(); | |
87 exit; | |
88 } | |
89 | |
90 | |
91 ### OPTIONS | |
92 unless ($global_single or $global_paired){ | |
93 if ($vanilla){ | |
94 die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n"; | |
95 } | |
96 warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n"; | |
97 } | |
98 | |
99 if ($global_paired){ | |
100 if ($global_single){ | |
101 die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n"; | |
102 } | |
103 if ($vanilla){ | |
104 | |
105 if ($rrbs){ | |
106 die "Barcode deduplication only works with Bismark SAM (or BAM) output (in attempt to phase out the vanilla format)\n"; | |
107 } | |
108 | |
109 warn "Processing paired-end custom Bismark output file(s):\n"; | |
110 warn join ("\t",@filenames),"\n\n"; | |
111 } | |
112 else{ | |
113 warn "Processing paired-end Bismark output file(s) (SAM format):\n"; | |
114 warn join ("\t",@filenames),"\n\n"; | |
115 } | |
116 } | |
117 else{ | |
118 if ($vanilla){ | |
119 warn "Processing single-end custom Bismark output file(s):\n"; | |
120 warn join ("\t",@filenames),"\n\n"; | |
121 } | |
122 else{ | |
123 warn "Processing single-end Bismark output file(s) (SAM format):\n"; | |
124 warn join ("\t",@filenames),"\n\n"; | |
125 } | |
126 } | |
127 | |
128 ### PATH TO SAMTOOLS | |
129 if (defined $samtools_path){ | |
130 # if Samtools was specified as full command | |
131 if ($samtools_path =~ /samtools$/){ | |
132 if (-e $samtools_path){ | |
133 # Samtools executable found | |
134 } | |
135 else{ | |
136 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
137 } | |
138 } | |
139 else{ | |
140 unless ($samtools_path =~ /\/$/){ | |
141 $samtools_path =~ s/$/\//; | |
142 } | |
143 $samtools_path .= 'samtools'; | |
144 if (-e $samtools_path){ | |
145 # Samtools executable found | |
146 } | |
147 else{ | |
148 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
149 } | |
150 } | |
151 } | |
152 # Check whether Samtools is in the PATH if no path was supplied by the user | |
153 else{ | |
154 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH | |
155 $samtools_path = `which samtools`; | |
156 chomp $samtools_path; | |
157 } | |
158 } | |
159 | |
160 if ($bam){ | |
161 if (defined $samtools_path){ | |
162 $bam = 1; | |
163 } | |
164 else{ | |
165 warn "No Samtools found on your system, writing out a gzipped SAM file instead\n"; | |
166 $bam = 2; | |
167 } | |
168 } | |
169 else{ | |
170 $bam = 0; | |
171 } | |
172 | |
173 | |
174 if ($representative){ | |
175 warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n"; | |
176 sleep (1); | |
177 } | |
178 elsif($rrbs){ | |
179 warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n"; | |
180 sleep (1); | |
181 } | |
182 else{ # default; random (=first) alignment | |
183 warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n"; | |
184 sleep (1); | |
185 } | |
186 | |
187 foreach my $file (@filenames){ | |
188 | |
189 if ($global_single){ | |
190 $paired = 0; | |
191 $single = 1; | |
192 } | |
193 elsif($global_paired){ | |
194 $paired = 1; | |
195 $single = 0; | |
196 } | |
197 | |
198 unless($global_single or $global_paired){ | |
199 | |
200 warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1); | |
201 | |
202 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file | |
203 if ($file =~ /\.gz$/){ | |
204 open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
205 } | |
206 elsif ($file =~ /\.bam$/){ | |
207 open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
208 } | |
209 else{ | |
210 open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; | |
211 } | |
212 while (<DETERMINE>){ | |
213 last unless (/^\@/); | |
214 if ($_ =~ /^\@PG/){ | |
215 # warn "found the \@PG line:\n"; | |
216 # warn "$_"; | |
217 | |
218 if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ | |
219 warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1); | |
220 $paired = 1; | |
221 $single = 0; | |
222 } | |
223 else{ | |
224 warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1); | |
225 $paired = 0; | |
226 $single = 1; | |
227 } | |
228 } | |
229 } | |
230 close DETERMINE or warn "$!\n"; | |
231 } | |
232 | |
233 if ($file =~ /(\.bam$|\.sam$)/){ | |
234 bam_isEmpty($file); | |
235 } | |
236 | |
237 | |
238 ### OPTIONS | |
239 unless ($single or $paired){ | |
240 die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n"; | |
241 } | |
242 | |
243 ### | |
244 unless ($vanilla){ | |
245 if ($paired){ | |
246 test_positional_sorting($file); | |
247 } | |
248 } | |
249 | |
250 | |
251 ### writing to a report file | |
252 my $report = $file; | |
253 | |
254 $report =~ s/\.gz$//; | |
255 $report =~ s/\.sam$//; | |
256 $report =~ s/\.bam$//; | |
257 $report =~ s/\.txt$//; | |
258 $report =~ s/$/.deduplication_report.txt/; | |
259 | |
260 open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n"; | |
261 | |
262 | |
263 ### for representative methylation calls we need to discriminate between single-end and paired-end files as the latter have 2 methylation call strings | |
264 if($representative){ | |
265 deduplicate_representative($file); | |
266 } | |
267 | |
268 elsif($rrbs){ | |
269 deduplicate_barcoded_rrbs($file); | |
270 } | |
271 | |
272 ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option | |
273 else{ | |
274 | |
275 my %unique_seqs; | |
276 my %positions; | |
277 | |
278 if ($file =~ /\.gz$/){ | |
279 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
280 } | |
281 elsif ($file =~ /\.bam$/){ | |
282 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
283 } | |
284 else{ | |
285 open (IN,$file) or die "Unable to read from $file: $!\n"; | |
286 } | |
287 | |
288 my $outfile = $file; | |
289 $outfile =~ s/\.gz$//; | |
290 $outfile =~ s/\.sam$//; | |
291 $outfile =~ s/\.bam$//; | |
292 $outfile =~ s/\.txt$//; | |
293 | |
294 if ($vanilla){ | |
295 $outfile =~ s/$/_deduplicated.txt/; | |
296 } | |
297 else{ | |
298 if ($bam == 1){ | |
299 $outfile =~ s/$/.deduplicated.bam/; | |
300 } | |
301 elsif ($bam == 2){ | |
302 $outfile =~ s/$/.deduplicated.sam.gz/; | |
303 } | |
304 else{ | |
305 $outfile =~ s/$/.deduplicated.sam/; | |
306 } | |
307 } | |
308 if ($bam == 1){ | |
309 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
310 } | |
311 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead | |
312 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
313 } | |
314 | |
315 else{ | |
316 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; | |
317 } | |
318 | |
319 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output | |
320 if ($vanilla){ | |
321 $_ = <IN>; # Bismark version header | |
322 print OUT; # Printing the Bismark version to the de-duplicated file again | |
323 } | |
324 my $count = 0; | |
325 my $unique_seqs = 0; | |
326 my $removed = 0; | |
327 | |
328 while (<IN>){ | |
329 | |
330 if ($count == 0){ | |
331 if ($_ =~ /^Bismark version:/){ | |
332 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; | |
333 sleep (2); | |
334 print_helpfile(); | |
335 exit; | |
336 } | |
337 } | |
338 | |
339 ### if this was a SAM file we ignore header lines | |
340 unless ($vanilla){ | |
341 if (/^\@\w{2}\t/){ | |
342 print "skipping header line:\t$_"; | |
343 print OUT "$_"; # Printing the header lines again into the de-duplicated file | |
344 next; | |
345 } | |
346 } | |
347 | |
348 ++$count; | |
349 my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths | |
350 | |
351 my ($strand,$chr,$start,$end,$cigar); | |
352 my $line1; | |
353 | |
354 if ($vanilla){ | |
355 ($strand,$chr,$start,$end) = (split (/\t/))[1,2,3,4]; | |
356 } | |
357 else{ # SAM format | |
358 ($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand | |
359 | |
360 ### SAM single-end | |
361 if ($single){ | |
362 | |
363 if ($strand == 0 ){ | |
364 ### read aligned to the forward strand. No action needed | |
365 } | |
366 elsif ($strand == 16){ | |
367 ### read is on reverse strand | |
368 | |
369 $start -= 1; # only need to adjust this once | |
370 | |
371 # for InDel free matches we can simply use the M number in the CIGAR string | |
372 if ($cigar =~ /^(\d+)M$/){ # linear match | |
373 $start += $1; | |
374 } | |
375 | |
376 else{ | |
377 # parsing CIGAR string | |
378 my @len = split (/\D+/,$cigar); # storing the length per operation | |
379 my @ops = split (/\d+/,$cigar); # storing the operation | |
380 shift @ops; # remove the empty first element | |
381 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
382 | |
383 # warn "CIGAR string; $cigar\n"; | |
384 ### determining end position of a read | |
385 foreach my $index(0..$#len){ | |
386 if ($ops[$index] eq 'M'){ # standard matching bases | |
387 $start += $len[$index]; | |
388 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
389 } | |
390 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
391 # warn "Operation is 'I', next\n"; | |
392 } | |
393 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
394 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
395 $start += $len[$index]; | |
396 } | |
397 else{ | |
398 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
399 } | |
400 } | |
401 } | |
402 } | |
403 $composite = join (":",$strand,$chr,$start); | |
404 } | |
405 elsif($paired){ | |
406 | |
407 ### storing the current line | |
408 $line1 = $_; | |
409 | |
410 my $read_conversion; | |
411 my $genome_conversion; | |
412 | |
413 while ( /(XR|XG):Z:([^\t]+)/g ) { | |
414 my $tag = $1; | |
415 my $value = $2; | |
416 | |
417 if ($tag eq "XR") { | |
418 $read_conversion = $value; | |
419 $read_conversion =~ s/\r//; | |
420 chomp $read_conversion; | |
421 } elsif ($tag eq "XG") { | |
422 $genome_conversion = $value; | |
423 $genome_conversion =~ s/\r//; | |
424 chomp $genome_conversion; | |
425 } | |
426 } | |
427 die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion); | |
428 | |
429 | |
430 my $index; | |
431 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
432 $index = 0; | |
433 $strand = '+'; | |
434 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
435 $index = 1; | |
436 $strand = '-'; | |
437 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
438 $index = 2; | |
439 $strand = '+'; | |
440 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
441 $index = 3; | |
442 $strand = '-'; | |
443 } else { | |
444 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
445 } | |
446 | |
447 # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2 | |
448 if ($index == 0 or $index == 2){ | |
449 | |
450 ### reading in the next line | |
451 $_ = <IN>; | |
452 # the only thing we need is the end position | |
453 ($end,my $cigar_2) = (split (/\t/))[3,5]; | |
454 | |
455 $end -= 1; # only need to adjust this once | |
456 | |
457 # for InDel free matches we can simply use the M number in the CIGAR string | |
458 if ($cigar_2 =~ /^(\d+)M$/){ # linear match | |
459 $end += $1; | |
460 } | |
461 else{ | |
462 # parsing CIGAR string | |
463 my @len = split (/\D+/,$cigar_2); # storing the length per operation | |
464 my @ops = split (/\d+/,$cigar_2); # storing the operation | |
465 shift @ops; # remove the empty first element | |
466 die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops); | |
467 | |
468 # warn "CIGAR string; $cigar_2\n"; | |
469 ### determining end position of the read | |
470 foreach my $index(0..$#len){ | |
471 if ($ops[$index] eq 'M'){ # standard matching bases | |
472 $end += $len[$index]; | |
473 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
474 } | |
475 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
476 # warn "Operation is 'I', next\n"; | |
477 } | |
478 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
479 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
480 $end += $len[$index]; | |
481 } | |
482 else{ | |
483 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
484 } | |
485 } | |
486 } | |
487 } | |
488 else{ | |
489 # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line | |
490 | |
491 $end = $start - 1; # need to adjust this only once | |
492 | |
493 # for InDel free matches we can simply use the M number in the CIGAR string | |
494 if ($cigar =~ /^(\d+)M$/){ # linear match | |
495 $end += $1; | |
496 } | |
497 else{ | |
498 # parsing CIGAR string | |
499 my @len = split (/\D+/,$cigar); # storing the length per operation | |
500 my @ops = split (/\d+/,$cigar); # storing the operation | |
501 shift @ops; # remove the empty first element | |
502 die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops); | |
503 | |
504 # warn "CIGAR string; $cigar\n"; | |
505 ### determining end position of the read | |
506 foreach my $index(0..$#len){ | |
507 if ($ops[$index] eq 'M'){ # standard matching bases | |
508 $end += $len[$index]; | |
509 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
510 } | |
511 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
512 # warn "Operation is 'I', next\n"; | |
513 } | |
514 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
515 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
516 $end += $len[$index]; | |
517 } | |
518 else{ | |
519 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
520 } | |
521 } | |
522 } | |
523 | |
524 ### reading in the next line | |
525 $_ = <IN>; | |
526 # the only thing we need is the start position | |
527 ($start) = (split (/\t/))[3]; | |
528 } | |
529 $composite = join (":",$strand,$chr,$start,$end); | |
530 } | |
531 | |
532 else{ | |
533 die "Input must be single or paired-end\n"; | |
534 } | |
535 } | |
536 | |
537 if (exists $unique_seqs{$composite}){ | |
538 ++$removed; | |
539 unless (exists $positions{$composite}){ | |
540 $positions{$composite}++; | |
541 } | |
542 } | |
543 else{ | |
544 if ($paired){ | |
545 unless ($vanilla){ | |
546 print OUT "$line1"; # printing first paired-end line for SAM output | |
547 } | |
548 } | |
549 print OUT "$_"; # printing single-end SAM alignment or second paired-end line | |
550 $unique_seqs{$composite}++; | |
551 } | |
552 } | |
553 | |
554 my $percentage; | |
555 my $percentage_leftover; | |
556 my $leftover = $count - $removed; | |
557 | |
558 unless ($count == 0){ | |
559 $percentage = sprintf("%.2f",$removed/$count*100); | |
560 $percentage_leftover = sprintf("%.2f",$leftover/$count*100); | |
561 } | |
562 else{ | |
563 $percentage = 'N/A'; | |
564 $percentage_leftover = 'N/A'; | |
565 } | |
566 | |
567 warn "\nTotal number of alignments analysed in $file:\t$count\n"; | |
568 warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; | |
569 warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; | |
570 warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; | |
571 | |
572 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; | |
573 print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; | |
574 print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; | |
575 print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; | |
576 } | |
577 | |
578 close OUT or warn "Failed to close output filehandle: $!\n"; | |
579 close REPORT or warn "Failed to close report filehandle: $!\n"; | |
580 | |
581 } | |
582 | |
583 | |
584 sub deduplicate_barcoded_rrbs{ | |
585 | |
586 my $file = shift; | |
587 | |
588 my %unique_seqs; | |
589 my %positions; | |
590 | |
591 if ($file =~ /\.gz$/){ | |
592 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
593 } | |
594 elsif ($file =~ /\.bam$/){ | |
595 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
596 } | |
597 else{ | |
598 open (IN,$file) or die "Unable to read from $file: $!\n"; | |
599 } | |
600 | |
601 my $outfile = $file; | |
602 $outfile =~ s/\.gz$//; | |
603 $outfile =~ s/\.sam$//; | |
604 $outfile =~ s/\.bam$//; | |
605 $outfile =~ s/\.txt$//; | |
606 | |
607 if ($vanilla){ | |
608 $outfile =~ s/$/_dedup_RRBS.txt/; | |
609 } | |
610 else{ | |
611 if ($bam == 1){ | |
612 $outfile =~ s/$/.dedup_RRBS.bam/; | |
613 } | |
614 elsif ($bam == 2){ | |
615 $outfile =~ s/$/.dedupRRBS.sam.gz/; | |
616 } | |
617 else{ | |
618 $outfile =~ s/$/.dedup_RRBS.sam/; | |
619 } | |
620 } | |
621 if ($bam == 1){ | |
622 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
623 } | |
624 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead | |
625 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
626 } | |
627 else{ | |
628 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; | |
629 } | |
630 | |
631 ### This mode only supports Bismark SAM output | |
632 my $count = 0; | |
633 my $unique_seqs = 0; | |
634 my $removed = 0; | |
635 | |
636 while (<IN>){ | |
637 | |
638 if ($count == 0){ | |
639 if ($_ =~ /^Bismark version:/){ | |
640 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; | |
641 sleep (2); | |
642 print_helpfile(); | |
643 exit; | |
644 } | |
645 } | |
646 | |
647 ### if this was a SAM file we ignore header lines | |
648 if (/^\@\w{2}\t/){ | |
649 warn "skipping SAM header line:\t$_"; | |
650 print OUT; # Printing the header lines again into the de-duplicated file | |
651 next; | |
652 } | |
653 | |
654 ++$count; | |
655 my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths | |
656 ### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication | |
657 ### in effect the $composite string looks like this (separated by ':'): | |
658 | |
659 ### FLAG:chromosome:start:barcode | |
660 | |
661 my $end; | |
662 my $line1; | |
663 | |
664 # SAM format | |
665 my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand | |
666 | |
667 $id =~ /:(\w+)$/; | |
668 my $barcode = $1; | |
669 unless ($barcode){ | |
670 die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n"; | |
671 } | |
672 | |
673 ### SAM single-end | |
674 if ($single){ | |
675 | |
676 if ($strand == 0 ){ | |
677 ### read aligned to the forward strand. No action needed | |
678 } | |
679 elsif ($strand == 16){ | |
680 ### read is on reverse strand | |
681 | |
682 $start -= 1; # only need to adjust this once | |
683 | |
684 # for InDel free matches we can simply use the M number in the CIGAR string | |
685 if ($cigar =~ /^(\d+)M$/){ # linear match | |
686 $start += $1; | |
687 } | |
688 else{ | |
689 # parsing CIGAR string | |
690 my @len = split (/\D+/,$cigar); # storing the length per operation | |
691 my @ops = split (/\d+/,$cigar); # storing the operation | |
692 shift @ops; # remove the empty first element | |
693 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
694 | |
695 # warn "CIGAR string; $cigar\n"; | |
696 ### determining end position of a read | |
697 foreach my $index(0..$#len){ | |
698 if ($ops[$index] eq 'M'){ # standard matching bases | |
699 $start += $len[$index]; | |
700 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
701 } | |
702 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
703 # warn "Operation is 'I', next\n"; | |
704 } | |
705 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
706 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
707 $start += $len[$index]; | |
708 } | |
709 else{ | |
710 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
711 } | |
712 } | |
713 } | |
714 } | |
715 | |
716 ### Here we take the barcode sequence into consideration | |
717 $composite = join (":",$strand,$chr,$start,$barcode); | |
718 # warn "$composite\n\n"; | |
719 # sleep(1); | |
720 } | |
721 elsif($paired){ | |
722 | |
723 ### storing the current line | |
724 $line1 = $_; | |
725 | |
726 my $read_conversion; | |
727 my $genome_conversion; | |
728 | |
729 while ( /(XR|XG):Z:([^\t]+)/g ) { | |
730 my $tag = $1; | |
731 my $value = $2; | |
732 | |
733 if ($tag eq "XR") { | |
734 $read_conversion = $value; | |
735 $read_conversion =~ s/\r//; | |
736 chomp $read_conversion; | |
737 } | |
738 elsif ($tag eq "XG") { | |
739 $genome_conversion = $value; | |
740 $genome_conversion =~ s/\r//; | |
741 chomp $genome_conversion; | |
742 } | |
743 } | |
744 die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion); | |
745 | |
746 | |
747 my $index; | |
748 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
749 $index = 0; | |
750 $strand = '+'; | |
751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
752 $index = 1; | |
753 $strand = '-'; | |
754 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
755 $index = 2; | |
756 $strand = '+'; | |
757 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
758 $index = 3; | |
759 $strand = '-'; | |
760 } else { | |
761 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
762 } | |
763 | |
764 # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2 | |
765 if ($index == 0 or $index == 2){ | |
766 | |
767 ### reading in the next line | |
768 $_ = <IN>; | |
769 # the only thing we need is the end position | |
770 ($end,my $cigar_2) = (split (/\t/))[3,5]; | |
771 | |
772 $end -= 1; # only need to adjust this once | |
773 | |
774 # for InDel free matches we can simply use the M number in the CIGAR string | |
775 if ($cigar_2 =~ /^(\d+)M$/){ # linear match | |
776 $end += $1; | |
777 } | |
778 else{ | |
779 # parsing CIGAR string | |
780 my @len = split (/\D+/,$cigar_2); # storing the length per operation | |
781 my @ops = split (/\d+/,$cigar_2); # storing the operation | |
782 shift @ops; # remove the empty first element | |
783 die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops); | |
784 | |
785 # warn "CIGAR string; $cigar_2\n"; | |
786 ### determining end position of the read | |
787 foreach my $index(0..$#len){ | |
788 if ($ops[$index] eq 'M'){ # standard matching bases | |
789 $end += $len[$index]; | |
790 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
791 } | |
792 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
793 # warn "Operation is 'I', next\n"; | |
794 } | |
795 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
796 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
797 $end += $len[$index]; | |
798 } | |
799 else{ | |
800 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
801 } | |
802 } | |
803 } | |
804 } | |
805 else{ | |
806 # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line | |
807 | |
808 $end = $start - 1; # need to adjust this only once | |
809 | |
810 # for InDel free matches we can simply use the M number in the CIGAR string | |
811 if ($cigar =~ /^(\d+)M$/){ # linear match | |
812 $end += $1; | |
813 } | |
814 else{ | |
815 # parsing CIGAR string | |
816 my @len = split (/\D+/,$cigar); # storing the length per operation | |
817 my @ops = split (/\d+/,$cigar); # storing the operation | |
818 shift @ops; # remove the empty first element | |
819 die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops); | |
820 | |
821 # warn "CIGAR string; $cigar\n"; | |
822 ### determining end position of the read | |
823 foreach my $index(0..$#len){ | |
824 if ($ops[$index] eq 'M'){ # standard matching bases | |
825 $end += $len[$index]; | |
826 # warn "Operation is 'M', adding $len[$index] bp\n"; | |
827 } | |
828 elsif($ops[$index] eq 'I'){ # insertions do not affect the end position | |
829 # warn "Operation is 'I', next\n"; | |
830 } | |
831 elsif($ops[$index] eq 'D'){ # deletions do affect the end position | |
832 # warn "Operation is 'D',adding $len[$index] bp\n"; | |
833 $end += $len[$index]; | |
834 } | |
835 else{ | |
836 die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; | |
837 } | |
838 } | |
839 } | |
840 | |
841 ### reading in the next line | |
842 $_ = <IN>; | |
843 # the only thing we need is the start position | |
844 ($start) = (split (/\t/))[3]; | |
845 } | |
846 | |
847 ### Here we take the barcode sequence into consideration | |
848 $composite = join (":",$strand,$chr,$start,$end,$barcode); | |
849 } | |
850 else{ | |
851 die "Input must be single or paired-end\n"; | |
852 } | |
853 | |
854 if (exists $unique_seqs{$composite}){ | |
855 ++$removed; | |
856 unless (exists $positions{$composite}){ | |
857 $positions{$composite}++; | |
858 } | |
859 } | |
860 else{ | |
861 if ($paired){ | |
862 print OUT $line1; # printing first paired-end line for SAM output | |
863 } | |
864 print OUT; # printing single-end SAM alignment or second paired-end line | |
865 $unique_seqs{$composite}++; | |
866 } | |
867 } | |
868 | |
869 my $percentage; | |
870 my $percentage_leftover; | |
871 my $leftover = $count - $removed; | |
872 | |
873 unless ($count == 0){ | |
874 $percentage = sprintf("%.2f",$removed/$count*100); | |
875 $percentage_leftover = sprintf("%.2f",$leftover/$count*100); | |
876 } | |
877 else{ | |
878 $percentage = 'N/A'; | |
879 $percentage_leftover = 'N/A'; | |
880 } | |
881 | |
882 | |
883 warn "\nTotal number of alignments analysed in $file:\t$count\n"; | |
884 warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; | |
885 warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; | |
886 warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; | |
887 | |
888 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; | |
889 print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; | |
890 print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; | |
891 print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; | |
892 | |
893 } | |
894 | |
895 sub bam_isEmpty{ | |
896 | |
897 my $file = shift; | |
898 | |
899 if ($file =~ /\.bam$/){ | |
900 open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n"; | |
901 } | |
902 else{ | |
903 open (EMPTY,$file) or die "Unable to read from $file: $!\n"; | |
904 } | |
905 my $count = 0; | |
906 while (<EMPTY>){ | |
907 if ($_){ | |
908 $count++; # file contains data, fine. | |
909 } | |
910 last; # one line is enough | |
911 } | |
912 | |
913 if ($count == 0){ | |
914 die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n"; | |
915 } | |
916 close EMPTY or warn "$!\n"; | |
917 } | |
918 | |
919 | |
920 sub print_helpfile{ | |
921 print "\n",'='x111,"\n"; | |
922 print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(this is the fastest option, and as the alignments are not ordered in any way this is also near enough random).\n\n"; | |
923 print "For single-end alignments only use the start coordinate of a read will be used for deduplication.\n\n"; | |
924 print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication. "; | |
925 print "This script expects the Bismark output to be in SAM format\n(Bismark v0.6.x or higher). To deduplicate the old custom Bismark output please specify '--vanilla'.\n\n"; | |
926 print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n) ***\n\n"; | |
927 | |
928 print '='x111,"\n\n"; | |
929 print ">>> USAGE: ./deduplicate_bismark_alignment_output.pl [options] filename(s) <<<\n\n"; | |
930 | |
931 print "-s/--single\t\tdeduplicate single-end Bismark files (default format: SAM)\n"; | |
932 print "-p/--paired\t\tdeduplicate paired-end Bismark files (default format: SAM)\n\n"; | |
933 print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n"; | |
934 print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n"; | |
935 print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n"; | |
936 print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n"; | |
937 print "--version\t\tPrint version information and exit\n"; | |
938 | |
939 print '='x111,"\n\n"; | |
940 | |
941 print "This script was last modified on November 04, 2015\n\n"; | |
942 } | |
943 | |
944 | |
945 | |
946 | |
947 sub test_positional_sorting{ | |
948 | |
949 my $filename = shift; | |
950 | |
951 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; | |
952 sleep(1); | |
953 | |
954 if ($filename =~ /\.gz$/) { | |
955 open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
956 } | |
957 elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam | |
958 if ($samtools_path){ | |
959 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; | |
960 } | |
961 else{ | |
962 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
963 } | |
964 } | |
965 else { | |
966 open (TEST,$filename) or die "Can't open file $filename: $!\n"; | |
967 } | |
968 | |
969 my $count = 0; | |
970 | |
971 while (<TEST>) { | |
972 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) | |
973 if (/^\@SO/) { | |
974 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n"; | |
975 } | |
976 next; | |
977 } | |
978 $count++; | |
979 | |
980 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID | |
981 | |
982 my ($id_1) = (split (/\t/)); | |
983 | |
984 ### reading the next line which should be read 2 | |
985 $_ = <TEST>; | |
986 my ($id_2) = (split (/\t/)); | |
987 last unless ($id_2); | |
988 ++$count; | |
989 | |
990 if ($id_1 eq $id_2){ | |
991 ### ids are the same | |
992 next; | |
993 } | |
994 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first | |
995 my $id_1_trunc = $id_1; | |
996 $id_1_trunc =~ s/\/1$//; | |
997 my $id_2_trunc = $id_2; | |
998 $id_2_trunc =~ s/\/2$//; | |
999 | |
1000 unless ($id_1_trunc eq $id_2_trunc){ | |
1001 die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n"; | |
1002 } | |
1003 } | |
1004 } | |
1005 # close TEST or die $!; somehow fails on our cluster... | |
1006 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) | |
1007 warn "...passed!\n"; | |
1008 sleep(1); | |
1009 | |
1010 } | |
1011 | |
1012 sub isBam{ | |
1013 | |
1014 my $filename = shift; | |
1015 | |
1016 # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) | |
1017 open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; | |
1018 | |
1019 ### when BAM files read through a gunzip -c stream they start with BAM... | |
1020 my $bam_in_disguise = <DISGUISE>; | |
1021 # warn "BAM in disguise: $bam_in_disguise\n\n"; | |
1022 | |
1023 if ($bam_in_disguise){ | |
1024 if ($bam_in_disguise =~ /^BAM/){ | |
1025 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
1026 return 1; | |
1027 } | |
1028 else{ | |
1029 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
1030 return 0; | |
1031 } | |
1032 } | |
1033 else{ | |
1034 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
1035 return 0; | |
1036 } | |
1037 } | |
1038 | |
1039 | |
1040 | |
1041 | |
1042 | |
1043 ##################################### | |
1044 | |
1045 ### The following subroutine "deduplicate_representative" only works correctly for reads that do not contain any indels (as in only Bowtie1-based alignments). | |
1046 ### Please refrain from using it unless you want to test something out. | |
1047 | |
1048 ##################################### | |
1049 | |
1050 sub deduplicate_representative { | |
1051 my $file = shift; | |
1052 | |
1053 my %positions; | |
1054 my %unique_seqs; | |
1055 | |
1056 my $count = 0; | |
1057 my $unique_seqs = 0; | |
1058 my $removed = 0; | |
1059 | |
1060 ### going through the file first and storing all positions as well as their methylation call strings in a hash | |
1061 if ($file =~ /\.gz$/){ | |
1062 open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
1063 } | |
1064 elsif ($file =~ /\.bam$/){ | |
1065 open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
1066 } | |
1067 else{ | |
1068 open (IN,$file) or die "Unable to read from $file: $!\n"; | |
1069 } | |
1070 | |
1071 if ($single){ | |
1072 | |
1073 my $outfile = $file; | |
1074 $outfile =~ s/\.gz$//; | |
1075 $outfile =~ s/\.sam$//; | |
1076 $outfile =~ s/\.bam$//; | |
1077 $outfile =~ s/\.txt$//; | |
1078 | |
1079 if ($vanilla){ | |
1080 $outfile =~ s/$/.deduplicated_to_representative_sequences.txt/; | |
1081 } | |
1082 else{ | |
1083 if ($bam == 1){ | |
1084 $outfile =~ s/$/.deduplicated_to_representative_sequences.bam/; | |
1085 } | |
1086 elsif ($bam == 2){ | |
1087 $outfile =~ s/$/.deduplicated_to_representative_sequences.sam.gz/; | |
1088 } | |
1089 else{ | |
1090 $outfile =~ s/$/.deduplicated_to_representative_sequences.sam/; | |
1091 } | |
1092 } | |
1093 | |
1094 if ($bam == 1){ | |
1095 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
1096 } | |
1097 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead | |
1098 open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; | |
1099 } | |
1100 else{ | |
1101 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; | |
1102 } | |
1103 | |
1104 warn "Reading and storing all alignment positions\n"; | |
1105 | |
1106 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output | |
1107 if ($vanilla){ | |
1108 $_ = <IN>; # Bismark version header | |
1109 print OUT; # Printing the Bismark version to the de-duplicated file again | |
1110 } | |
1111 | |
1112 while (<IN>){ | |
1113 | |
1114 if ($count == 0){ | |
1115 if ($_ =~ /^Bismark version:/){ | |
1116 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; | |
1117 sleep (2); | |
1118 print_helpfile(); | |
1119 exit; | |
1120 } | |
1121 } | |
1122 | |
1123 ### if this was a SAM file we ignore header lines | |
1124 unless ($vanilla){ | |
1125 if (/^\@\w{2}\t/){ | |
1126 warn "skipping SAM header line:\t$_"; | |
1127 print OUT; # Printing the header lines again into the de-duplicated file | |
1128 next; | |
1129 } | |
1130 } | |
1131 | |
1132 my ($strand,$chr,$start,$end,$meth_call); | |
1133 | |
1134 if ($vanilla){ | |
1135 ($strand,$chr,$start,$end,$meth_call) = (split (/\t/))[1,2,3,4,7]; | |
1136 } | |
1137 else{ # SAM format | |
1138 | |
1139 ($strand,$chr,$start,my $seq,$meth_call) = (split (/\t/))[1,2,3,9,13]; # we are assigning the FLAG value to $strand | |
1140 ### SAM single-end | |
1141 $end = $start + length($seq) - 1; | |
1142 } | |
1143 | |
1144 my $composite = join (":",$strand,$chr,$start,$end); | |
1145 | |
1146 $count++; | |
1147 $positions{$composite}->{$meth_call}->{count}++; | |
1148 $positions{$composite}->{$meth_call}->{alignment} = $_; | |
1149 } | |
1150 warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n"; | |
1151 close IN or warn $!; | |
1152 } | |
1153 | |
1154 elsif ($paired){ | |
1155 | |
1156 ### we are going to concatenate both methylation call strings from the paired end file to form a joint methylation call string | |
1157 | |
1158 my $outfile = $file; | |
1159 if ($vanilla){ | |
1160 $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.txt/; | |
1161 } | |
1162 else{ | |
1163 $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.sam/; | |
1164 } | |
1165 | |
1166 open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; | |
1167 warn "Reading and storing all alignment positions\n"; | |
1168 | |
1169 ### need to proceed slightly differently for the custom Bismark and Bismark SAM output | |
1170 if ($vanilla){ | |
1171 $_ = <IN>; # Bismark version header | |
1172 print OUT; # Printing the Bismark version to the de-duplicated file again | |
1173 } | |
1174 | |
1175 while (<IN>){ | |
1176 | |
1177 if ($count == 0){ | |
1178 if ($_ =~ /^Bismark version:/){ | |
1179 warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; | |
1180 sleep (2); | |
1181 print_helpfile(); | |
1182 exit; | |
1183 } | |
1184 } | |
1185 | |
1186 ### if this was a SAM file we ignore header lines | |
1187 unless ($vanilla){ | |
1188 if (/^\@\w{2}\t/){ | |
1189 warn "skipping SAM header line:\t$_"; | |
1190 print OUT; # Printing the header lines again into the de-duplicated file | |
1191 next; | |
1192 } | |
1193 } | |
1194 | |
1195 my ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2); | |
1196 my $line1; | |
1197 | |
1198 if ($vanilla){ | |
1199 ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2) = (split (/\t/))[1,2,3,4,7,10]; | |
1200 } | |
1201 else{ # SAM paired-end format | |
1202 | |
1203 ($strand,$chr,$start,$meth_call_1) = (split (/\t/))[1,2,3,13]; # we are assigning the FLAG value to $strand | |
1204 | |
1205 ### storing the first line (= read 1 alignment) | |
1206 $line1 = $_; | |
1207 | |
1208 ### reading in the next line | |
1209 $_ = <IN>; | |
1210 # we only need the end position and the methylation call | |
1211 (my $pos,my $seq_2,$meth_call_2) = (split (/\t/))[3,9,13]; | |
1212 $end = $pos + length($seq_2) - 1; | |
1213 } | |
1214 | |
1215 my $composite = join (":",$strand,$chr,$start,$end); | |
1216 $count++; | |
1217 my $meth_call = $meth_call_1.$meth_call_2; | |
1218 | |
1219 $positions{$composite}->{$meth_call}->{count}++; | |
1220 if ($vanilla){ | |
1221 $positions{$composite}->{$meth_call}->{alignment} = $_; | |
1222 } | |
1223 else{ # SAM PAIRED-END | |
1224 $positions{$composite}->{$meth_call}->{alignment_1} = $line1; | |
1225 $positions{$composite}->{$meth_call}->{alignment_2} = $_; | |
1226 } | |
1227 } | |
1228 warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n"; | |
1229 close IN or warn $!; | |
1230 } | |
1231 | |
1232 ### PRINTING RESULTS | |
1233 | |
1234 ### Now going through all stored positions and printing out the methylation call which is most representative, i.e. the one which occurred most often | |
1235 warn "Now printing out alignments with the most representative methylation call(s)\n"; | |
1236 | |
1237 foreach my $pos (keys %positions){ | |
1238 foreach my $meth_call (sort { $positions{$pos}->{$b}->{count} <=> $positions{$pos}->{$a}->{count} }keys %{$positions{$pos}}){ | |
1239 if ($paired){ | |
1240 if ($vanilla){ | |
1241 print OUT $positions{$pos}->{$meth_call}->{alignment}; | |
1242 } | |
1243 else{ | |
1244 print OUT $positions{$pos}->{$meth_call}->{alignment_1}; # SAM read 1 | |
1245 print OUT $positions{$pos}->{$meth_call}->{alignment_2}; # SAM read 2 | |
1246 } | |
1247 } | |
1248 else{ # single-end | |
1249 print OUT $positions{$pos}->{$meth_call}->{alignment}; | |
1250 } | |
1251 $unique_seqs++; | |
1252 last; ### exiting once we printed a sequence with the most frequent methylation call for a position | |
1253 } | |
1254 } | |
1255 | |
1256 my $percentage; | |
1257 unless ($count == 0){ | |
1258 $percentage = sprintf ("%.2f",$unique_seqs*100/$count); | |
1259 } | |
1260 else{ | |
1261 $percentage = 'N/A'; | |
1262 } | |
1263 | |
1264 warn "\nTotal number of alignments analysed in $file:\t$count\n"; | |
1265 warn "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n"; | |
1266 print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; | |
1267 print REPORT "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n"; | |
1268 | |
1269 } | |
1270 | |
1271 | |
1272 |