0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3
|
|
4 ###Builds interpolated pileup from SAM file
|
|
5 ##@description counts bases between paired ends and piles up single end reads.
|
|
6 ##@output, uses a #header for the RNAME and then the number of reads per base
|
|
7 ##@author sm8@sanger.ac.uk, Stephen B. Montgomery
|
|
8
|
|
9 ##@caveats
|
|
10 ##Requires RNAME to have format as per example
|
|
11 ## chromosome:NCBI36:18:1:76117153:1
|
|
12 ## supercontig::NT_113883:1:137703:1
|
|
13 ## clone::AC138827.3:1:149397:1
|
|
14 ##Expects simple CIGAR characters, M, I and D
|
|
15 ##Expects SAM file to be sorted.
|
|
16 ##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77)
|
|
17
|
|
18 ##Verify and read in SAM file
|
|
19 my $sam_file = $ARGV[0];
|
|
20 if(!defined($sam_file)) { die("No sam file defined on arg 1"); }
|
|
21 unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); }
|
|
22 open(SAM, $sam_file) || die("Cannot open sam file");
|
|
23
|
|
24 ##Globals
|
|
25 my $current_location = ""; ##Current RNAME being processed
|
|
26 my $current_size = 0; ##Size of sequence region being processed
|
|
27 my $current_position = 1; ##Current base being processed
|
|
28 my $open = 0; ##Number of open reads (PE reads that have not been closed)
|
|
29 my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the
|
|
30 ##contained value from those open and deletes the indexed position from the hash
|
|
31
|
|
32 while (my $line = <SAM>) {
|
|
33 my @tokens = split /\t/, $line;
|
|
34
|
|
35 if ($current_location ne $tokens[2]) { ##Start a new sequence region
|
|
36 for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region
|
|
37 if (defined($close{$i})) {
|
|
38 $open = $open - $close{$i};
|
|
39 delete $close{$i};
|
|
40 }
|
|
41 print $open . "\n";
|
|
42 }
|
|
43 if ($current_location ne "") {
|
|
44 print "\n";
|
|
45 }
|
|
46
|
|
47 ##Initiate a new sequence region
|
|
48 my @location_tokens = split /:/, $tokens[2];
|
|
49 $current_position = 1;
|
|
50 $current_location = $tokens[2];
|
|
51 $current_size = $location_tokens[4];
|
|
52 $open = 0;
|
|
53 %close = ();
|
|
54 print "#" . $tokens[2] . "\n";
|
|
55
|
|
56 ##Print pileup to just before the first read (will be 0)
|
|
57 for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) {
|
|
58 print $open . "\n";
|
|
59 }
|
|
60 $current_position = $tokens[3];
|
|
61
|
|
62 } else { ##Sequence region already open
|
|
63 if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position
|
|
64 ##cycle through to catch up to the current position
|
|
65 for (my $i = $current_position; $i < $tokens[3]; $i++) {
|
|
66 if (defined($close{$i})) {
|
|
67 $open = $open - $close{$i};
|
|
68 delete $close{$i};
|
|
69 }
|
|
70 print $open . "\n";
|
|
71 }
|
|
72 $current_position = $tokens[3];
|
|
73 }
|
|
74 }
|
|
75 $open++; ##Increment the number of open reads
|
|
76
|
|
77 if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition
|
|
78 $open--;
|
|
79 my $parsed_cig = &parseCigar($tokens[5]);
|
|
80 my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
|
|
81 if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
|
|
82 $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
|
|
83 } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition
|
|
84 my $parsed_cig = &parseCigar($tokens[5]);
|
|
85 my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
|
|
86 if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
|
|
87 $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
|
|
88 } else {
|
|
89 #do nothing
|
|
90 }
|
|
91 }
|
|
92 for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region
|
|
93 if (defined($close{$i})) {
|
|
94 $open = $open - $close{$i};
|
|
95 delete $close{$i};
|
|
96 }
|
|
97 print $open . "\n";
|
|
98 }
|
|
99 print "\n";
|
|
100 close(SAM);
|
|
101 exit(0);
|
|
102
|
|
103 ##reads and tokenizes simple cigarline
|
|
104 sub parseCigar() {
|
|
105 my $cigar_line = shift;
|
|
106 $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g;
|
|
107 my @cigar_tokens = split /\t/, $cigar_line;
|
|
108 my %parsed = ('M' => 0,
|
|
109 'I' => 0,
|
|
110 'D' => 0);
|
|
111 my @events = ();
|
|
112 for(my $i = 0; $i < scalar(@cigar_tokens); $i++) {
|
|
113 if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) {
|
|
114 if (!defined($parsed{$2})) { $parsed{$2} = 0; }
|
|
115 my $nt = $2;
|
|
116 if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; }
|
|
117 $parsed{$nt} += $1;
|
|
118 my %event_el = ("t" => $nt,
|
|
119 "n" => $1);
|
|
120 push @events, \%event_el;
|
|
121 }
|
|
122 }
|
|
123 $parsed{'events'} = \@events;
|
|
124 return \%parsed;
|
|
125 }
|