| 0 | 1 #!/usr/bin/perl -w | 
|  | 2 | 
|  | 3 # Author: lh3 | 
|  | 4 | 
|  | 5 use strict; | 
|  | 6 use warnings; | 
|  | 7 use Getopt::Std; | 
|  | 8 | 
|  | 9 my $version = '0.3.3'; | 
|  | 10 &usage if (@ARGV < 1); | 
|  | 11 | 
|  | 12 my $command = shift(@ARGV); | 
|  | 13 my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf, | 
|  | 14 			unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq); | 
|  | 15 | 
|  | 16 die("Unknown command \"$command\".\n") if (!defined($func{$command})); | 
|  | 17 &{$func{$command}}; | 
|  | 18 exit(0); | 
|  | 19 | 
|  | 20 # | 
|  | 21 # showALEN | 
|  | 22 # | 
|  | 23 | 
|  | 24 sub showALEN { | 
|  | 25   die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN); | 
|  | 26   while (<>) { | 
|  | 27 	my @t = split; | 
|  | 28 	next if (/^\@/ || @t < 11); | 
|  | 29 	my $l = 0; | 
|  | 30 	$_ = $t[5]; | 
|  | 31 	s/(\d+)[MI]/$l+=$1/eg; | 
|  | 32 	print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n"; | 
|  | 33   } | 
|  | 34 } | 
|  | 35 | 
|  | 36 # | 
|  | 37 # varFilter | 
|  | 38 # | 
|  | 39 | 
|  | 40 # | 
|  | 41 # Filtration code: | 
|  | 42 # | 
|  | 43 # d low depth | 
|  | 44 # D high depth | 
|  | 45 # W too many SNPs in a window (SNP only) | 
|  | 46 # G close to a high-quality indel (SNP only) | 
|  | 47 # Q low RMS mapping quality (SNP only) | 
|  | 48 # g close to another indel with higher quality (indel only) | 
|  | 49 # s low SNP quality (SNP only) | 
|  | 50 # i low indel quality (indel only) | 
|  | 51 | 
|  | 52 sub varFilter { | 
|  | 53   my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>''); | 
|  | 54   getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts); | 
|  | 55   die(qq/ | 
|  | 56 Usage:   samtools.pl varFilter [options] <in.cns-pileup> | 
|  | 57 | 
|  | 58 Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}] | 
|  | 59          -q INT    minimum RMS mapping quality for gaps [$opts{q}] | 
|  | 60          -d INT    minimum read depth [$opts{d}] | 
|  | 61          -D INT    maximum read depth [$opts{D}] | 
|  | 62          -S INT    minimum SNP quality [$opts{S}] | 
|  | 63          -i INT    minimum indel quality [$opts{i}] | 
|  | 64 | 
|  | 65          -G INT    min indel score for nearby SNP filtering [$opts{G}] | 
|  | 66          -w INT    SNP within INT bp around a gap to be filtered [$opts{w}] | 
|  | 67 | 
|  | 68          -W INT    window size for filtering dense SNPs [$opts{W}] | 
|  | 69          -N INT    max number of SNPs in a window [$opts{N}] | 
|  | 70 | 
|  | 71          -l INT    window size for filtering adjacent gaps [$opts{l}] | 
|  | 72 | 
|  | 73          -p        print filtered variants | 
|  | 74 \n/) if (@ARGV == 0 && -t STDIN); | 
|  | 75 | 
|  | 76   # calculate the window size | 
|  | 77   my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W}); | 
|  | 78   my $max_dist = $ol > $ow? $ol : $ow; | 
|  | 79   $max_dist = $oW if ($max_dist < $oW); | 
|  | 80   # the core loop | 
|  | 81   my @staging; # (indel_filtering_score, flt_tag) | 
|  | 82   while (<>) { | 
|  | 83 	my @t = split; | 
|  | 84 	next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites | 
|  | 85 	# clear the out-of-range elements | 
|  | 86 	while (@staging) { | 
|  | 87       # Still on the same chromosome and the first element's window still affects this position? | 
|  | 88 	  last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); | 
|  | 89 	  varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much | 
|  | 90 	} | 
|  | 91 	my ($flt, $score) = (0, -1); | 
|  | 92 	# first a simple filter | 
|  | 93 	if ($t[7] < $opts{d}) { | 
|  | 94 	  $flt = 2; | 
|  | 95 	} elsif ($t[7] > $opts{D}) { | 
|  | 96 	  $flt = 3; | 
|  | 97 	} | 
|  | 98     if ($t[2] eq '*') { # an indel | 
|  | 99         if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; } | 
|  | 100     } | 
|  | 101     elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; }    # SNP | 
|  | 102 | 
|  | 103 	# site dependent filters | 
|  | 104     my $len=0; | 
|  | 105 	if ($flt == 0) { | 
|  | 106 	  if ($t[2] eq '*') { # an indel | 
|  | 107         # If deletion, remember the length of the deletion | 
|  | 108         my ($a,$b) = split(m{/},$t[3]); | 
|  | 109         my $alen = length($a) - 1; | 
|  | 110         my $blen = length($b) - 1; | 
|  | 111         if ( $alen>$blen ) | 
|  | 112         { | 
|  | 113             if ( substr($a,0,1) eq '-' ) { $len=$alen; } | 
|  | 114         } | 
|  | 115         elsif ( substr($b,0,1) eq '-' ) { $len=$blen; } | 
|  | 116 | 
|  | 117 		$flt = 1 if ($t[6] < $opts{q}); | 
|  | 118 		# filtering SNPs | 
|  | 119 		if ($t[5] >= $opts{G}) { | 
|  | 120 		  for my $x (@staging) { | 
|  | 121             # Is it a SNP and is it outside the SNP filter window? | 
|  | 122 			next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]); | 
|  | 123 			$x->[1] = 5 if ($x->[1] == 0); | 
|  | 124 		  } | 
|  | 125 		} | 
|  | 126 		# calculate the filtering score (different from indel quality) | 
|  | 127 		$score = $t[5]; | 
|  | 128 		$score += $opts{s} * $t[10] if ($t[8] ne '*'); | 
|  | 129 		$score += $opts{s} * $t[11] if ($t[9] ne '*'); | 
|  | 130 		# check the staging list for indel filtering | 
|  | 131 		for my $x (@staging) { | 
|  | 132           # Is it a SNP and is it outside the gap filter window | 
|  | 133 		  next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]); | 
|  | 134 		  if ($x->[0] < $score) { | 
|  | 135 			$x->[1] = 6; | 
|  | 136 		  } else { | 
|  | 137 			$flt = 6; last; | 
|  | 138 		  } | 
|  | 139 		} | 
|  | 140 	  } else { # a SNP | 
|  | 141 		$flt = 1 if ($t[6] < $opts{Q}); | 
|  | 142 		# check adjacent SNPs | 
|  | 143 		my $k = 1; | 
|  | 144 		for my $x (@staging) { | 
|  | 145 		  ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5)); | 
|  | 146 		} | 
|  | 147 		# filtering is necessary | 
|  | 148 		if ($k > $opts{N}) { | 
|  | 149 		  $flt = 4; | 
|  | 150 		  for my $x (@staging) { | 
|  | 151 			 $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0); | 
|  | 152 		  } | 
|  | 153 		} else { # then check gap filter | 
|  | 154 		  for my $x (@staging) { | 
|  | 155 			next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]); | 
|  | 156 			if ($x->[0] >= $opts{G}) { | 
|  | 157 			  $flt = 5; last; | 
|  | 158 			} | 
|  | 159 		  } | 
|  | 160 		} | 
|  | 161 	  } | 
|  | 162 	} | 
|  | 163 	push(@staging, [$score, $flt, $len, @t]); | 
|  | 164   } | 
|  | 165   # output the last few elements in the staging list | 
|  | 166   while (@staging) { | 
|  | 167 	varFilter_aux(shift @staging, $opts{p}); | 
|  | 168   } | 
|  | 169 } | 
|  | 170 | 
|  | 171 sub varFilter_aux { | 
|  | 172   my ($first, $is_print) = @_; | 
|  | 173   if ($first->[1] == 0) { | 
|  | 174 	print join("\t", @$first[3 .. @$first-1]), "\n"; | 
|  | 175   } elsif ($is_print) { | 
|  | 176 	print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; | 
|  | 177   } | 
|  | 178 } | 
|  | 179 | 
|  | 180 # | 
|  | 181 # pileup2fq | 
|  | 182 # | 
|  | 183 | 
|  | 184 sub pileup2fq { | 
|  | 185   my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10); | 
|  | 186   getopts('d:D:Q:G:l:', \%opts); | 
|  | 187   die(qq/ | 
|  | 188 Usage:   samtools.pl pileup2fq [options] <in.cns-pileup> | 
|  | 189 | 
|  | 190 Options: -d INT    minimum depth        [$opts{d}] | 
|  | 191          -D INT    maximum depth        [$opts{D}] | 
|  | 192          -Q INT    min RMS mapQ         [$opts{Q}] | 
|  | 193          -G INT    minimum indel score  [$opts{G}] | 
|  | 194          -l INT    indel filter winsize [$opts{l}]\n | 
|  | 195 /) if (@ARGV == 0 && -t STDIN); | 
|  | 196 | 
|  | 197   my ($last_chr, $seq, $qual, @gaps, $last_pos); | 
|  | 198   my $_Q = $opts{Q}; | 
|  | 199   my $_d = $opts{d}; | 
|  | 200   my $_D = $opts{D}; | 
|  | 201 | 
|  | 202   $last_chr = ''; | 
|  | 203   while (<>) { | 
|  | 204 	my @t = split; | 
|  | 205 	if ($last_chr ne $t[0]) { | 
|  | 206 	  &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); | 
|  | 207 	  $last_chr = $t[0]; | 
|  | 208 	  $last_pos = 0; | 
|  | 209 	  $seq = ''; $qual = ''; | 
|  | 210 	  @gaps = (); | 
|  | 211 	} | 
|  | 212 	if ($t[1] - $last_pos != 1) { | 
|  | 213 	  $seq .= 'n' x ($t[1] - $last_pos - 1); | 
|  | 214 	  $qual .= '!' x ($t[1] - $last_pos - 1); | 
|  | 215 	} | 
|  | 216 	if ($t[2] eq '*') { | 
|  | 217 	  push(@gaps, $t[1]) if ($t[5] >= $opts{G}); | 
|  | 218 	} else { | 
|  | 219 	  $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]); | 
|  | 220 	  my $q = $t[4] + 33; | 
|  | 221 	  $q = 126 if ($q > 126); | 
|  | 222 	  $qual .= chr($q); | 
|  | 223 	} | 
|  | 224 	$last_pos = $t[1]; | 
|  | 225   } | 
|  | 226   &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); | 
|  | 227 } | 
|  | 228 | 
|  | 229 sub p2q_post_process { | 
|  | 230   my ($chr, $seq, $qual, $gaps, $l) = @_; | 
|  | 231   &p2q_filter_gaps($seq, $gaps, $l); | 
|  | 232   print "\@$chr\n"; &p2q_print_str($seq); | 
|  | 233   print "+\n"; &p2q_print_str($qual); | 
|  | 234 } | 
|  | 235 | 
|  | 236 sub p2q_filter_gaps { | 
|  | 237   my ($seq, $gaps, $l) = @_; | 
|  | 238   for my $g (@$gaps) { | 
|  | 239 	my $x = $g > $l? $g - $l : 0; | 
|  | 240 	substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l)); | 
|  | 241   } | 
|  | 242 } | 
|  | 243 | 
|  | 244 sub p2q_print_str { | 
|  | 245   my ($s) = @_; | 
|  | 246   my $l = length($$s); | 
|  | 247   for (my $i = 0; $i < $l; $i += 60) { | 
|  | 248 	print substr($$s, $i, 60), "\n"; | 
|  | 249   } | 
|  | 250 } | 
|  | 251 | 
|  | 252 # | 
|  | 253 # sam2fq | 
|  | 254 # | 
|  | 255 | 
|  | 256 sub sam2fq { | 
|  | 257   my %opts = (n=>20, p=>''); | 
|  | 258   getopts('n:p:', \%opts); | 
|  | 259   die("Usage: samtools.pl sam2fq [-n 20] [-p <prefix>] <inp.sam>\n") if (@ARGV == 0 && -t STDIN); | 
|  | 260   if ($opts{p} && $opts{n} > 1) { | 
|  | 261 	my $pre = $opts{p}; | 
|  | 262 	my @fh; | 
|  | 263 	for (0 .. $opts{n}-1) { | 
|  | 264 	  open($fh[$_], sprintf("| gzip > $pre.%.3d.fq.gz", $_)) || die; | 
|  | 265 	} | 
|  | 266 	my $i = 0; | 
|  | 267 	while (<>) { | 
|  | 268 	  next if (/^@/); | 
|  | 269 	  chomp; | 
|  | 270 	  my @t = split("\t"); | 
|  | 271 	  next if ($t[9] eq '*'); | 
|  | 272 	  my ($name, $seq, $qual); | 
|  | 273 	  if ($t[1] & 16) { # reverse strand | 
|  | 274 		$seq = reverse($t[9]); | 
|  | 275 		$qual = reverse($t[10]); | 
|  | 276 		$seq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 277 	  } else { | 
|  | 278 		($seq, $qual) = @t[9,10]; | 
|  | 279 	  } | 
|  | 280 	  $name = $t[0]; | 
|  | 281 	  $name .= "/1" if ($t[1] & 0x40); | 
|  | 282 	  $name .= "/2" if ($t[1] & 0x80); | 
|  | 283 	  print {$fh[$i]} "\@$name\n$seq\n"; | 
|  | 284 	  if ($qual ne '*') { | 
|  | 285 		print {$fh[$i]} "+\n$qual\n"; | 
|  | 286 	  } | 
|  | 287 	  $i = 0 if (++$i == $opts{n}); | 
|  | 288 	} | 
|  | 289 	close($fh[$_]) for (0 .. $opts{n}-1); | 
|  | 290   } else { | 
|  | 291 	die("To be implemented.\n"); | 
|  | 292   } | 
|  | 293 } | 
|  | 294 | 
|  | 295 # | 
|  | 296 # sra2hdr | 
|  | 297 # | 
|  | 298 | 
|  | 299 # This subroutine does not use an XML parser. It requires that the SRA | 
|  | 300 # XML files are properly formated. | 
|  | 301 sub sra2hdr { | 
|  | 302   my %opts = (); | 
|  | 303   getopts('', \%opts); | 
|  | 304   die("Usage: samtools.pl sra2hdr <SRA.prefix>\n") if (@ARGV == 0); | 
|  | 305   my $pre = $ARGV[0]; | 
|  | 306   my $fh; | 
|  | 307   # read sample | 
|  | 308   my $sample = 'UNKNOWN'; | 
|  | 309   open($fh, "$pre.sample.xml") || die; | 
|  | 310   while (<$fh>) { | 
|  | 311 	$sample = $1 if (/<SAMPLE.*alias="([^"]+)"/i); | 
|  | 312   } | 
|  | 313   close($fh); | 
|  | 314   # read experiment | 
|  | 315   my (%exp2lib, $exp); | 
|  | 316   open($fh, "$pre.experiment.xml") || die; | 
|  | 317   while (<$fh>) { | 
|  | 318 	if (/<EXPERIMENT.*accession="([^\s"]+)"/i) { | 
|  | 319 	  $exp = $1; | 
|  | 320 	} elsif (/<LIBRARY_NAME>\s*(\S+)\s*<\/LIBRARY_NAME>/i) { | 
|  | 321 	  $exp2lib{$exp} = $1; | 
|  | 322 	} | 
|  | 323   } | 
|  | 324   close($fh); | 
|  | 325   # read run | 
|  | 326   my ($run, @fn); | 
|  | 327   open($fh, "$pre.run.xml") || die; | 
|  | 328   while (<$fh>) { | 
|  | 329 	if (/<RUN.*accession="([^\s"]+)"/i) { | 
|  | 330 	  $run = $1; @fn = (); | 
|  | 331 	} elsif (/<EXPERIMENT_REF.*accession="([^\s"]+)"/i) { | 
|  | 332 	  print "\@RG\tID:$run\tSM:$sample\tLB:$exp2lib{$1}\n"; | 
|  | 333 	} elsif (/<FILE.*filename="([^\s"]+)"/i) { | 
|  | 334 	  push(@fn, $1); | 
|  | 335 	} elsif (/<\/RUN>/i) { | 
|  | 336 	  if (@fn == 1) { | 
|  | 337 		print STDERR "$fn[0]\t$run\n"; | 
|  | 338 	  } else { | 
|  | 339 		for (0 .. $#fn) { | 
|  | 340 		  print STDERR "$fn[$_]\t$run", "_", $_+1, "\n"; | 
|  | 341 		} | 
|  | 342 	  } | 
|  | 343 	} | 
|  | 344   } | 
|  | 345   close($fh); | 
|  | 346 } | 
|  | 347 | 
|  | 348 # | 
|  | 349 # unique | 
|  | 350 # | 
|  | 351 | 
|  | 352 sub unique { | 
|  | 353   my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3); | 
|  | 354   getopts('Qf:q:r:a:b:m', \%opts); | 
|  | 355   die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN); | 
|  | 356   my $last = ''; | 
|  | 357   my $recal_Q = !defined($opts{Q}); | 
|  | 358   my $multi_only = defined($opts{m}); | 
|  | 359   my @a; | 
|  | 360   while (<>) { | 
|  | 361 	my $score = -1; | 
|  | 362 	print $_ if (/^\@/); | 
|  | 363 	$score = $1 if (/AS:i:(\d+)/); | 
|  | 364 	my @t = split("\t"); | 
|  | 365 	next if (@t < 11); | 
|  | 366 	if ($score < 0) { # AS tag is unavailable | 
|  | 367 	  my $cigar = $t[5]; | 
|  | 368 	  my ($mm, $go, $ge) = (0, 0, 0); | 
|  | 369 	  $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg; | 
|  | 370 	  $cigar = $t[5]; | 
|  | 371 	  $cigar =~ s/(\d+)M/$mm+=$1/eg; | 
|  | 372 	  $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches... | 
|  | 373 	} | 
|  | 374 	$score = 1 if ($score < 1); | 
|  | 375 	if ($t[0] ne $last) { | 
|  | 376 	  &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); | 
|  | 377 	  $last = $t[0]; | 
|  | 378 	} | 
|  | 379 	push(@a, [$score, \@t]); | 
|  | 380   } | 
|  | 381   &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); | 
|  | 382 } | 
|  | 383 | 
|  | 384 sub unique_aux { | 
|  | 385   my ($a, $fac, $is_recal, $multi_only) = @_; | 
|  | 386   my ($max, $max2, $max_i) = (0, 0, -1); | 
|  | 387   for (my $i = 0; $i < @$a; ++$i) { | 
|  | 388 	if ($a->[$i][0] > $max) { | 
|  | 389 	  $max2 = $max; $max = $a->[$i][0]; $max_i = $i; | 
|  | 390 	} elsif ($a->[$i][0] > $max2) { | 
|  | 391 	  $max2 = $a->[$i][0]; | 
|  | 392 	} | 
|  | 393   } | 
|  | 394   if ($is_recal) { | 
|  | 395 	if (!$multi_only || @$a > 1) { | 
|  | 396 	  my $q = int($fac * ($max - $max2) / $max + .499); | 
|  | 397 	  $q = 250 if ($q > 250); | 
|  | 398 	  $a->[$max_i][1][4] = $q < 250? $q : 250; | 
|  | 399 	} | 
|  | 400   } | 
|  | 401   print join("\t", @{$a->[$max_i][1]}); | 
|  | 402   @$a = (); | 
|  | 403 } | 
|  | 404 | 
|  | 405 # | 
|  | 406 # uniqcmp: compare two SAM files | 
|  | 407 # | 
|  | 408 | 
|  | 409 sub uniqcmp { | 
|  | 410   my %opts = (q=>10, s=>100); | 
|  | 411   getopts('pq:s:', \%opts); | 
|  | 412   die("Usage: samtools.pl uniqcmp <in1.sam> <in2.sam>\n") if (@ARGV < 2); | 
|  | 413   my ($fh, %a); | 
|  | 414   warn("[uniqcmp] read the first file...\n"); | 
|  | 415   &uniqcmp_aux($ARGV[0], \%a, 0); | 
|  | 416   warn("[uniqcmp] read the second file...\n"); | 
|  | 417   &uniqcmp_aux($ARGV[1], \%a, 1); | 
|  | 418   warn("[uniqcmp] stats...\n"); | 
|  | 419   my @cnt; | 
|  | 420   $cnt[$_] = 0 for (0..9); | 
|  | 421   for my $x (keys %a) { | 
|  | 422 	my $p = $a{$x}; | 
|  | 423 	my $z; | 
|  | 424 	if (defined($p->[0]) && defined($p->[1])) { | 
|  | 425 	  $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 0 : 1; | 
|  | 426 	  if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) { | 
|  | 427 		++$cnt[$z*3+0]; | 
|  | 428 	  } elsif ($p->[0][3] >= $opts{q}) { | 
|  | 429 		++$cnt[$z*3+1]; | 
|  | 430 	  } elsif ($p->[1][3] >= $opts{q}) { | 
|  | 431 		++$cnt[$z*3+2]; | 
|  | 432 	  } | 
|  | 433 	  print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", | 
|  | 434 		$p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q})); | 
|  | 435 	} elsif (defined($p->[0])) { | 
|  | 436 	  ++$cnt[$p->[0][3]>=$opts{q}? 6 : 7]; | 
|  | 437 	  print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t", | 
|  | 438 		$p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q}); | 
|  | 439 	} else { | 
|  | 440 	  print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", | 
|  | 441 		-$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q}); | 
|  | 442 	  ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9]; | 
|  | 443 	} | 
|  | 444   } | 
|  | 445   print "Consistent (high, high):   $cnt[0]\n"; | 
|  | 446   print "Consistent (high, low ):   $cnt[1]\n"; | 
|  | 447   print "Consistent (low , high):   $cnt[2]\n"; | 
|  | 448   print "Inconsistent (high, high): $cnt[3]\n"; | 
|  | 449   print "Inconsistent (high, low ): $cnt[4]\n"; | 
|  | 450   print "Inconsistent (low , high): $cnt[5]\n"; | 
|  | 451   print "Second missing (high):     $cnt[6]\n"; | 
|  | 452   print "Second missing (low ):     $cnt[7]\n"; | 
|  | 453   print "First  missing (high):     $cnt[8]\n"; | 
|  | 454   print "First  missing (low ):     $cnt[9]\n"; | 
|  | 455 } | 
|  | 456 | 
|  | 457 sub uniqcmp_aux { | 
|  | 458   my ($fn, $a, $which) = @_; | 
|  | 459   my $fh; | 
|  | 460   $fn = "samtools view $fn |" if ($fn =~ /\.bam/); | 
|  | 461   open($fh, $fn) || die; | 
|  | 462   while (<$fh>) { | 
|  | 463 	my @t = split; | 
|  | 464 	next if (@t < 11); | 
|  | 465 #	my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0; | 
|  | 466 	my $l = 0; | 
|  | 467 	my ($x, $nm) = (0, 0); | 
|  | 468 	$nm = $1 if (/NM:i:(\d+)/); | 
|  | 469 	$_ = $t[5]; | 
|  | 470 	s/(\d+)[MI]/$x+=$1/eg; | 
|  | 471 	@{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - 4 * $nm); | 
|  | 472   } | 
|  | 473   close($fh); | 
|  | 474 } | 
|  | 475 | 
|  | 476 sub plp2vcf { | 
|  | 477   while (<>) { | 
|  | 478 	my @t = split; | 
|  | 479 	next if ($t[3] eq '*/*'); | 
|  | 480 	if ($t[2] eq '*') { # indel | 
|  | 481 	  my @s = split("/", $t[3]); | 
|  | 482 	  my (@a, @b); | 
|  | 483 	  my ($ref, $alt); | 
|  | 484 	  for (@s) { | 
|  | 485 		next if ($_ eq '*'); | 
|  | 486 		if (/^-/) { | 
|  | 487 		  push(@a, 'N'.substr($_, 1)); | 
|  | 488 		  push(@b, 'N'); | 
|  | 489 		} elsif (/^\+/) { | 
|  | 490 		  push(@a, 'N'); | 
|  | 491 		  push(@b, 'N'.substr($_, 1)); | 
|  | 492 		} | 
|  | 493 	  } | 
|  | 494 	  if ($a[0] && $a[1]) { | 
|  | 495 		if (length($a[0]) < length($a[1])) { | 
|  | 496 		  $ref = $a[1]; | 
|  | 497 		  $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]"; | 
|  | 498 		} elsif (length($a[0]) > length($a[1])) { | 
|  | 499 		  $ref = $a[0]; | 
|  | 500 		  $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]"; | 
|  | 501 		} else { | 
|  | 502 		  $ref = $a[0]; | 
|  | 503 		  $alt = ($b[0] eq $b[1])? $b[0] : "$b[0],$b[1]"; | 
|  | 504 		} | 
|  | 505 	  } else { | 
|  | 506 		$ref = $a[0]; $alt = $b[0]; | 
|  | 507 	  } | 
|  | 508 	  print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n"; | 
|  | 509 	} else { # SNP | 
|  | 510 	} | 
|  | 511   } | 
|  | 512 } | 
|  | 513 | 
|  | 514 # | 
|  | 515 # Usage | 
|  | 516 # | 
|  | 517 | 
|  | 518 sub usage { | 
|  | 519   die(qq/ | 
|  | 520 Program: samtools.pl (helper script for SAMtools) | 
|  | 521 Version: $version | 
|  | 522 Contact: Heng Li <lh3\@sanger.ac.uk>\n | 
|  | 523 Usage:   samtools.pl <command> [<arguments>]\n | 
|  | 524 Command: varFilter     filtering SNPs and short indels | 
|  | 525          pileup2fq     generate fastq from `pileup -c' | 
|  | 526          showALEN      print alignment length (ALEN) following CIGAR | 
|  | 527 \n/); | 
|  | 528 } |