annotate tools/samtools/pileup_parser.pl @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #! /usr/bin/perl -w
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 use strict;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 use POSIX;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 my $in_file = $ARGV[0];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 my $ref_base_column = $ARGV[1]-1; # 1 based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 my $read_bases_column = $ARGV[2]-1; # 1 based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 my $base_quality_column = $ARGV[3]-1; # 1 based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 my $cvrg_column = $ARGV[4]-1; # 1 based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 my $quality_cutoff = $ARGV[5]; # phred scale integer
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 my $cvrg_cutoff = $ARGV[6]; # unsigned integer
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 my $coord_column = $ARGV[9]-1; #1 based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 my $out_file = $ARGV[10];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 my $invalid_line_counter = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 my $first_skipped_line = "";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 my %SNPs = ('A',0,'T',0,'C',0,'G',0);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 my $above_qv_bases = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 my $SNPs_exist = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 my $out_string = "";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 my $diff_count = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 open (IN, "<$in_file") or die "Cannot open $in_file $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 open (OUT, ">$out_file") or die "Cannot open $out_file $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 while (<IN>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 chop;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 next if m/^\#/;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 my @fields = split /\t/;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 next if $fields[ $ref_base_column ] eq "*"; # skip indel lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 my $read_bases = $fields[ $read_bases_column ];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] );
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 next if $fields[ $cvrg_column ] < $cvrg_cutoff;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 my $base_quality = $fields[ $base_quality_column ];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 if ($read_bases =~ m/[\$\^\+-]/) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 $read_bases =~ s/\^.//g; #removing the start of the read segement mark
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 $read_bases =~ s/\$//g; #removing end of the read segment mark
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 while ($read_bases =~ m/[\+-]{1}(\d+)/g) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 my $indel_len = $1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 if ( length($read_bases) != length($base_quality) ) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 $first_skipped_line = $. if $first_skipped_line eq "";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 ++$invalid_line_counter;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 next;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 # after removing read block and indel data the length of read_base
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 # field should identical to the length of base_quality field
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 my @bases = split //, $read_bases;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 my @qv = split //, $base_quality;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 for my $base ( 0 .. @bases - 1 ) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 ++$above_qv_bases;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 if ( $bases[ $base ] =~ m/[ATGC]/i )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 $SNPs_exist = 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 $SNPs{ uc( $bases[ $base ] ) } += 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 $diff_count += 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 } elsif ( $bases[ $base ] =~ m/[\.,]/ ) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 if ($bed eq "Yes") {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 my $start = $fields[ $coord_column ] - 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 my $end = $fields[ $coord_column ];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 $fields[ $coord_column ] = "$start\t$end";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if ($print_qual_bases ne "Yes") {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 $fields[ $base_quality_column ] = "";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 $fields[ $read_bases_column ] = "";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 $out_string = join("\t", @fields); # \t$read_bases\t$base_quality";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 foreach my $SNP (sort keys %SNPs) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 $out_string .= "\t$SNPs{$SNP}";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 if ($total_diff eq "Yes") {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 $out_string .= "\t$above_qv_bases\t$diff_count\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 } else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 $out_string .= "\t$above_qv_bases\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 $out_string =~ s/\t+/\t/g;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 if ( $SNPs_only eq "Yes" ) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 print OUT $out_string if $SNPs_exist == 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 } else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 print OUT $out_string;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 %SNPs = ();
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 %SNPs = ('A',0,'T',0,'C',0,'G',0);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 $above_qv_bases = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 $SNPs_exist = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 $diff_count = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 close IN;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 close OUT;