Mercurial > repos > jjohnson > crest
comparison bam2html.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use Carp; | |
4 use Getopt::Long; | |
5 use English; | |
6 use Pod::Usage; | |
7 use Data::Dumper; | |
8 use Bio::Assembly::IO; | |
9 use Bio::DB::Sam; | |
10 use Bio::DB::Sam::Constants; | |
11 use File::Spec; | |
12 use File::Path; | |
13 use File::Copy; | |
14 use File::Basename; | |
15 | |
16 use constant HTML_COLOR => { | |
17 'MATCH' => '#000000', #black | |
18 'MISMATCH' => '#800000', #brown | |
19 'LOWQUAL' => '#C0C0C0', #grey | |
20 'INDEL' => '#FF8040', #Orange | |
21 'SCLIP' => '#6960EC', #Slate Blue2 | |
22 }; | |
23 my $lowqual_cutoff = 20; | |
24 | |
25 my ($bam_d, $bam_g, $file); | |
26 my $ref_genome; | |
27 my $range; | |
28 | |
29 my ( $help, $man, $version, $usage ); | |
30 my $output; | |
31 my $RNASeq; | |
32 my $optionOK = GetOptions( | |
33 'd=s' => \$bam_d, | |
34 'g=s' => \$bam_g, | |
35 'f|i|file|input=s' => \$file, | |
36 'r|ref_genome=s' => \$ref_genome, | |
37 'range=s' => \$range, | |
38 'RNASeq' => \$RNASeq, | |
39 'o|output=s' => \$output, | |
40 'h|help|?' => \$help, | |
41 'man' => \$man, | |
42 'usage' => \$usage, | |
43 'v|version' => \$version, | |
44 ); | |
45 | |
46 pod2usage(-verbose=>2) if($man or $usage); | |
47 pod2usage(1) if($help or $version ); | |
48 croak "you must specify at least one bam file" unless ($bam_d || $bam_g); | |
49 warn "Missing reference genome, the view is not correct" unless($ref_genome); | |
50 croak "You need either provide an input file or a range" unless ($file || $range); | |
51 | |
52 my @bams; | |
53 push @bams, Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome) if($bam_d); | |
54 push @bams, Bio::DB::Sam->new( -bam => $bam_g, -fasta => $ref_genome) if($bam_g); | |
55 open STDOUT, ">$output" if $output; | |
56 | |
57 if($range) { | |
58 print STDOUT bam2html($range, @bams); | |
59 close STDOUT if $output; | |
60 exit(0); | |
61 } | |
62 | |
63 open my $FILE, "<$file" or croak "can't open $file:$OS_ERROR"; | |
64 while( my $line = <$FILE>) { | |
65 chomp $line; | |
66 my @fields = split /\t/, $line; | |
67 print "<h3 style=\"color:blue;text-align:center\">$line</h3>\n"; | |
68 if(scalar(@fields) == 3) { # deal with file: chr start end format | |
69 my ($chr, $s, $e) = @fields; | |
70 print STDOUT bam2html("$chr:$s-$e", @bams); | |
71 next; | |
72 } | |
73 # deal with CREST output file | |
74 # if($fields[0] =~ /^chr/) { | |
75 # $fields[0] = substr($fields[0], 3); | |
76 # $fields[2] = substr($fields[2], 3); | |
77 # } | |
78 my ($chr, $s, $e) = ($fields[0], $fields[1], $fields[1]); | |
79 print STDOUT bam2html("$chr:$s-$e", @bams); | |
80 ($chr, $s, $e) = ($fields[4], $fields[5], $fields[5]); | |
81 print STDOUT bam2html("$chr:$s-$e", @bams); | |
82 } | |
83 close STDOUT if($output); | |
84 exit(0); | |
85 #print bam2html("12:11676858-11676858", $bam1, $bam2); | |
86 | |
87 | |
88 sub get_input_bam { | |
89 my ($raw_bam_dir, $sample, $group) = @_; | |
90 | |
91 $raw_bam_dir = File::Spec->catdir($raw_bam_dir, $group); | |
92 opendir(my $dh, $raw_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR"; | |
93 my @files = grep { /^$sample-.*bam$/ } readdir($dh); | |
94 close $dh; | |
95 return $files[0]; | |
96 } | |
97 | |
98 | |
99 # this function returns a <pre> </pre> block for the specific contig in the ace file | |
100 sub bam2html { | |
101 my ($range, @bams) = @_; | |
102 my ($chr, $start, $end) = $range =~ m/^(.*?):(\d+)-(\d+)/; | |
103 if(!$chr or !$start or !$end) { | |
104 croak "The range format is not correct, use chr:start-end"; | |
105 } | |
106 my ($r_start, $r_end) = ($start, $end); | |
107 my $name_len = length($range); | |
108 for my $bam (@bams) { | |
109 my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end); | |
110 my @alignments = $segment->features; | |
111 # return "There are too many reads in this region, this is must be a highly repetitive region, abort!" | |
112 # if(scalar(@alignments) > 500); | |
113 for my $a(@alignments) { | |
114 my $l = length($a->query->name); | |
115 $name_len = $l if($name_len < $l); | |
116 } | |
117 } | |
118 if($start == $end) { # we want to check a specific point | |
119 if($RNASeq) { # for RNASeq we just extend a little bit instead of dynamic do it | |
120 $r_start = $start - 100; | |
121 $r_end = $end + 100; | |
122 } | |
123 else { | |
124 for my $bam (@bams) { | |
125 my $segment = $bam->segment(-seq_id => $chr, -start => $start - 1, -end => $end + 1); | |
126 my @alignments = $segment->features; | |
127 for my $a (@alignments) { | |
128 next if($a->start > $end || $a->end < $start); | |
129 $r_start = $a->start if $a->start < $r_start; | |
130 $r_end = $a->end if $a->end > $r_end; | |
131 } | |
132 } | |
133 } | |
134 $range = "$chr:$r_start-$r_end"; | |
135 } | |
136 my $rtn_str = "<pre>\n"; | |
137 my($ref, $pos2padded) = get_padded_ref($range, @bams); | |
138 $rtn_str .= print_bam_ruler($r_start, $r_end, $pos2padded, $name_len) . "\n"; | |
139 my $line = print_ref($ref, $range, $name_len); | |
140 my $line_len = length($line); | |
141 $rtn_str .= $line . "\n" . ' ' x $line_len . "\n"; | |
142 $ref =~ s/\*//g; #remove * from ref sequence | |
143 for my $bam (@bams) { | |
144 my $segment = $bam->segment(-seq_id => $chr, -start => $start, -end => $end); | |
145 my @alignments = $segment->features; | |
146 for my $a (@alignments) { | |
147 #next if($a->cigar_str !~ m/S/); | |
148 if($a->strand == 1) { | |
149 my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len); | |
150 $rtn_str .= $tmp_str . "\n" if($tmp_str); | |
151 } | |
152 } | |
153 for my $a (@alignments) { | |
154 #next if($a->cigar_str !~ m/S/); | |
155 if($a->strand == -1) { | |
156 my $tmp_str = print_bam_seq($a, $r_start, $r_end, $ref, $pos2padded, $name_len); | |
157 $rtn_str .= $tmp_str . "\n" if($tmp_str); | |
158 } | |
159 } | |
160 $rtn_str .= '_' x $line_len . "\n"; | |
161 } | |
162 $rtn_str .= "\n</pre>"; | |
163 return $rtn_str; | |
164 } | |
165 | |
166 # only print the unpadded position, which is meaningful | |
167 sub print_bam_ruler { | |
168 my($start, $end, $pos2padded, $name_len) = @_; | |
169 my ($string, $mark); | |
170 $string = $mark = " "x($name_len + 2); | |
171 | |
172 #print | at 20 and . at 10 | |
173 for( my $i = $start; $i <= $end; $i++) { | |
174 $mark .= $i % 10 == 0 ? ($i % 20 == 0 ? "|" : ".") : " "; | |
175 last if($i == $end); | |
176 $mark .= ' ' x ($pos2padded->[$i - $start + 2] - $pos2padded->[$i - $start + 1] - 1); | |
177 } | |
178 # print numbers above | for padded consensus | |
179 my $padded_i = 0; | |
180 for( my $i = 1; $i <= $end - $start + 1; $i++ ) { | |
181 my $j = $i + $start - 1; | |
182 my $overhead = $pos2padded->[$i] - $padded_i; | |
183 if($overhead > 0 ) { | |
184 $string .= ' ' x $overhead; | |
185 $padded_i += $overhead; | |
186 } | |
187 if( $j % 20 == 0) { | |
188 my $l = length($j); | |
189 my $half_l = int(($l+1)/2); | |
190 $string = substr($string, 0, length($string) - $half_l ); | |
191 $string .= $j; | |
192 $padded_i += ($l - $half_l); | |
193 } | |
194 } | |
195 return join("\n", ($string, $mark)); | |
196 } | |
197 | |
198 sub print_ref { | |
199 my( $ref, $chr, $name_len) = @_; | |
200 return $chr . " " x ($name_len + 2 - length($chr)) . $ref; | |
201 } | |
202 | |
203 sub get_padded_ref { | |
204 my ($range, @bams) = @_; | |
205 my $ref_str; #padded ref genome | |
206 my @pos2padded; | |
207 my($chr, $start, $end) = $range =~ m/(.*?):(\d+)-(\d+)/; | |
208 | |
209 @pos2padded = 0 .. ($end - $start + 1); | |
210 push @pos2padded, 100 + $end - $start; | |
211 | |
212 foreach my $bam (@bams) { | |
213 my ($d, $cum_pad) = (0, 0); | |
214 my $padded_fun = sub { | |
215 my ($seqid, $pos, $p) = @_; | |
216 return if($pos < $start || $pos > $end); | |
217 $d++; | |
218 $pos2padded[$d] += $cum_pad; | |
219 my $max_ins = 0; | |
220 for my $pileup (@$p) { | |
221 $max_ins = $pileup->indel if($max_ins < $pileup->indel); | |
222 } | |
223 if($max_ins > $pos2padded[$d+1] + $cum_pad - $pos2padded[$d] - 1) { | |
224 $cum_pad += $max_ins - ($pos2padded[$d+1] + $cum_pad- $pos2padded[$d] - 1); | |
225 } | |
226 }; | |
227 $bam->pileup($range, $padded_fun); | |
228 if($d <= $end - $start + 1) { | |
229 for(my $i = $d + 1; $i <= $end-$start+1; $i++) { | |
230 $pos2padded[$i] += $cum_pad; | |
231 } | |
232 } | |
233 } | |
234 my $refseq = $bams[0]->segment($chr, $start, $end)->dna; | |
235 my @str = split //, "*" x $pos2padded[$end - $start + 1]; | |
236 for my $i ( 1 .. ($end - $start + 1)) { | |
237 $str[$pos2padded[$i]] = substr($refseq, $i-1, 1); | |
238 } | |
239 shift @str; #remove the 0th base | |
240 return (join( '', @str), \@pos2padded ); | |
241 } | |
242 | |
243 sub print_bam_seq { # it's complicated | |
244 my ($align, $start, $end, $ref, $pos2padded, $name_len) = @_; | |
245 my $rtn = $align->query->name; | |
246 $rtn .= ' ' x ($name_len - length($rtn)); | |
247 $rtn .= $align->strand == -1 ? '- ' : '+ '; | |
248 my $p_s = $align->start; | |
249 my $p_e = $align->end; | |
250 my $seq = $align->query->dna; | |
251 my @qual = $align->qscore; | |
252 my $repeat = 0; | |
253 if($align->has_tag("XT")) { | |
254 $repeat = 1 if($align->aux_get("XT") ne "U"); | |
255 } | |
256 | |
257 my $r_cigar = $align->cigar_array; | |
258 my ($s, $e) = ($align->query->start, $align->query->end); | |
259 | |
260 # reads partial in the region | |
261 my $leading = ""; | |
262 if($p_s < $start) { | |
263 ($r_cigar, $p_s, $s) = find_new_start($r_cigar, $p_s, $s, $start); | |
264 } | |
265 | |
266 my $tailing = ""; | |
267 if($p_e > $end) { | |
268 ($r_cigar, $p_e, $e) = find_new_end($r_cigar, $p_e, $e, $end); | |
269 } | |
270 return if($s >= $e); | |
271 my @cigar = @{$r_cigar}; | |
272 | |
273 # deal with leading softclip | |
274 my $op = shift @cigar; | |
275 my $l = $pos2padded->[$p_s - $start + 1] - $pos2padded->[1]; | |
276 if($op->[0] eq 'S') { | |
277 my $ss = $op->[1] - $l; | |
278 if($ss < 0) { | |
279 $ss = -$ss; | |
280 $leading .= ' ' x $ss; | |
281 $ss = 0; | |
282 } | |
283 $leading .= '<font color="' . HTML_COLOR->{'SCLIP'} . '">'; | |
284 for( my $i = $ss; $i < $op->[1]; $i++) { | |
285 my $c = substr($seq, $i, 1); | |
286 $c = lc $c if($qual[$i] < $lowqual_cutoff); | |
287 $leading .= $c; | |
288 } | |
289 $leading .= '</font>' ; | |
290 } | |
291 else { | |
292 $leading .= ' ' x $l; | |
293 unshift @cigar, $op; | |
294 } | |
295 | |
296 #dealing with tailing softclip | |
297 $op = pop @cigar; | |
298 $l = $pos2padded->[$end - $start + 1] - $pos2padded->[$p_e - $start + 1]; | |
299 if($op->[0] eq 'S') { | |
300 my $ee = $op->[1] - $l; | |
301 $ee = 0 if($ee < 0 ); | |
302 $tailing = '<font color="' . HTML_COLOR->{'SCLIP'} . '">'; | |
303 for( my $i = 0; $i < $op->[1] - $ee; $i++){ | |
304 my $c = substr($seq, $i + $e, 1); | |
305 $c = lc $c if($qual[$i + $e] < $lowqual_cutoff); | |
306 $tailing .= $c; | |
307 } | |
308 $tailing .= '</font>'; | |
309 } | |
310 else{ | |
311 push @cigar, $op; | |
312 } | |
313 | |
314 # generate the alignment part, only M, I, D and N | |
315 my $mid = ''; | |
316 my $mode; | |
317 foreach $op (@cigar) { | |
318 my $l = $op->[1]; | |
319 if($op->[0] eq 'M') { | |
320 my $line = ''; | |
321 while($l > 0 ){ | |
322 $l--; | |
323 my $c = substr($seq, $s - 1, 1); #seq is 0 based | |
324 my $newmode = "MISMATCH"; | |
325 my $cc = substr($ref, $p_s - $start, 1); | |
326 #last unless ($c && $cc); | |
327 $newmode = "MATCH" if($cc eq $c); | |
328 if($qual[$s-1] < $lowqual_cutoff) { | |
329 $c = lc $c; | |
330 $newmode = 'LOWQUAL' if($newmode eq 'MATCH'); | |
331 } | |
332 if(!$mode) { | |
333 $line .= '<font color="' . HTML_COLOR->{$newmode} . '">'; | |
334 } | |
335 elsif($mode ne $newmode) { | |
336 $line .= '</font>' . '<font color="' . HTML_COLOR->{$newmode} . '">'; | |
337 } | |
338 $mode = $newmode; | |
339 $line .= $c; | |
340 $s++; $p_s++; | |
341 # dealing with padded * in reference genome | |
342 if($p_s < $p_e && $s < $e && $l > 0) { | |
343 my $tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s-1 - $start + 1]; | |
344 if( $tmp > 1) { | |
345 $line .= '</font>'; | |
346 $line .= '<font color="' . HTML_COLOR->{'INDEL'} . '">'; | |
347 $line .= '*' x ($tmp - 1); | |
348 $mode = 'INDEL'; | |
349 } | |
350 } | |
351 } | |
352 $mid .= $line; | |
353 } | |
354 if($op->[0] eq 'D' || $op->[0] eq 'I' || $op->[0] eq 'N') { | |
355 my $newmode = 'INDEL'; | |
356 my $tmp; #extra padded * after indel | |
357 $mid .= '</font>' if($mode && $mode ne $newmode); | |
358 $mid .= '<font color="' . HTML_COLOR->{'INDEL'} . '">'; | |
359 | |
360 if($op->[0] eq 'D' || $op->[0] eq 'N') { | |
361 $mid .= ($op->[0] eq 'D' ? '*' : '=') x $l; | |
362 $p_s += $l; | |
363 $tmp = $pos2padded->[$p_s - $start + 1]-$pos2padded->[$p_s - $l - $start ] - $l if($p_s < $p_e); | |
364 } | |
365 else{ | |
366 $tmp = $pos2padded->[$p_s - $start + 1] - $pos2padded->[$p_s - 1 - $start + 1] - $l if($p_s < $p_e); | |
367 while($l > 0 ) { | |
368 my $c = substr($seq, $s - 1, 1); | |
369 $c = lc $c if($qual[$s - 1] < $lowqual_cutoff); | |
370 $mid .= $c; | |
371 $l--; $s++; | |
372 } | |
373 } | |
374 $mode = $newmode; | |
375 if($p_s < $p_e && $tmp > 1) { | |
376 $mid .= '*' x ($tmp - 1); | |
377 } | |
378 } | |
379 } | |
380 $mid .= '</font>'; | |
381 $rtn .= $leading . $mid . $tailing; | |
382 $rtn = '<b><i>' . $rtn . '</i></b>' if($repeat); | |
383 return $rtn; | |
384 } | |
385 | |
386 sub find_new_start { | |
387 my ($r_cigar, $p_s, $s, $start) = @_; | |
388 my @cigar = @{$r_cigar}; | |
389 | |
390 while(1) { | |
391 my $op = shift @cigar; | |
392 next if( $op->[0] eq 'S' || $op->[0] eq 'H'); | |
393 if( $op->[0] eq 'I') { | |
394 $s += $op->[1]; next; | |
395 } | |
396 if( $p_s + $op->[1] < $start ) { | |
397 $p_s += $op->[1]; | |
398 $s += $op->[1] if $op->[0] eq 'M'; | |
399 } | |
400 else { | |
401 $s += ($start - $p_s) if $op->[0] eq 'M'; | |
402 unshift @cigar, [$op->[0], $op->[1] - ($start - $p_s)]; | |
403 $p_s = $start; | |
404 return (\@cigar, $p_s, $s); | |
405 } | |
406 } | |
407 } | |
408 | |
409 sub find_new_end { | |
410 my ($r_cigar, $p_e, $e, $end) = @_; | |
411 my @cigar = @{$r_cigar}; | |
412 | |
413 while(1) { | |
414 my $op = pop @cigar; | |
415 next if( $op->[0] eq 'S' || $op->[0] eq 'H'); | |
416 if( $op->[0] eq 'I') { | |
417 $e -= $op->[1]; next; | |
418 } | |
419 if( $p_e - $op->[1] > $end ) { | |
420 $p_e -= $op->[1]; | |
421 $e -= $op->[1] if $op->[0] eq 'M'; | |
422 } | |
423 else { | |
424 $e -= ($p_e - $end) if $op->[0] eq 'M'; | |
425 push @cigar, [$op->[0], $op->[1] - ($p_e - $end)]; | |
426 $p_e = $end; | |
427 return (\@cigar, $p_e, $e); | |
428 } | |
429 } | |
430 } | |
431 | |
432 =head1 NAME | |
433 | |
434 bam2html.pl - a bam file viewer that just simple display part of the alignment as HTML file. | |
435 | |
436 | |
437 =head1 VERSION | |
438 | |
439 This documentation refers to bam2html.pl version 0.0.1. | |
440 | |
441 | |
442 =head1 USAGE | |
443 | |
444 Display part of a bam file: | |
445 bam2html.pl -r hg18.fa -d diag.bam --range 1:123566-123766 -o diag.html | |
446 Display part of two bam files, one diagnositc, one germlie for comparison. | |
447 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam --range 1:123566-123766 -o diag.html | |
448 Display part of two bam files, one diagnositc, one germlie for comparison from | |
449 a list of positions in a file, each line should be tab sepearted as: chr, start, and end. | |
450 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o diag.html -f position.txt | |
451 Display part of two bam files, one diagnositc, one germlie for comparison from a file | |
452 generated by CREST.pl. | |
453 bam2html.pl -r hg18.fa -d diag.bam -g germline.bam -o predSV.html -f predSV.txt | |
454 | |
455 =head1 REQUIRED ARGUMENTS | |
456 | |
457 To run the program, several parameter must specified. | |
458 -d: The input (diagnositic) bam file | |
459 -g: The input (germ line) bam file | |
460 -r, --ref_genome: The reference genome file in fa format | |
461 | |
462 =head1 OPTIONS | |
463 | |
464 The options that can be used for the program. | |
465 -o: The output file, default to STDOUT if missing. | |
466 --range: The range where SV will be detected, using chr1:100-200 format. | |
467 -f, -i: The input file from either CREST.pl or tab seperated chr, start, end. | |
468 | |
469 -h, --help Help information | |
470 --man Man page. | |
471 --usage Usage information. | |
472 --version Software version. | |
473 | |
474 | |
475 =head1 DESCRIPTION | |
476 | |
477 This is a bam file viewer that just simple display part of the alignment as | |
478 HTML file. The program is developed to view Structure Varitions around break | |
479 point. So manual review will a breeze. Any way this program can be used | |
480 otherway, but don't put a too big range to display as the view will not | |
481 be pretty as each read will occupy a line. | |
482 | |
483 | |
484 =head1 DIAGNOSTICS | |
485 | |
486 If the program does not respond for minutes, please be a little bit patient as | |
487 sometimes generating the html file could take longer time. | |
488 If you provide a range as "chr1:50-100" and nothing was output, please make sure | |
489 the reference genomes for mapping and display are exact the same and the chrom | |
490 name is chr1 not 1. | |
491 | |
492 | |
493 =head1 DEPENDENCIES | |
494 | |
495 The program depend on several packages: | |
496 1. Bioperl perl module. | |
497 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed. | |
498 | |
499 | |
500 =head1 BUGS AND LIMITATIONS | |
501 | |
502 There are no known bugs in this module, but the method is limitted to bam file | |
503 that has soft-clipping cigar string generated.Please report problems to | |
504 Jianmin Wang (Jianmin.Wang@stjude.org) | |
505 Patches are welcome. | |
506 | |
507 =head1 AUTHOR | |
508 | |
509 Jianmin Wang (Jianmin.Wang@stjude.org) | |
510 | |
511 | |
512 | |
513 =head1 LICENCE AND COPYRIGHT | |
514 | |
515 Copyright (c) 2010 by St. Jude Children's Research Hospital. | |
516 | |
517 This program is free software: you can redistribute it and/or modify | |
518 it under the terms of the GNU General Public License as published by | |
519 the Free Software Foundation, either version 2 of the License, or | |
520 (at your option) any later version. | |
521 | |
522 This program is distributed in the hope that it will be useful, | |
523 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
524 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
525 GNU General Public License for more details. | |
526 | |
527 You should have received a copy of the GNU General Public License | |
528 along with this program. If not, see <http://www.gnu.org/licenses/>. |