Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/sam2vcf.pl @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 | |
4 # | |
5 # Contact: pd3@sanger | |
6 # Version: 2010-04-23 | |
7 | |
8 use strict; | |
9 use warnings; | |
10 use Carp; | |
11 | |
12 my $opts = parse_params(); | |
13 do_pileup_to_vcf($opts); | |
14 | |
15 exit; | |
16 | |
17 #--------------- | |
18 | |
19 sub error | |
20 { | |
21 my (@msg) = @_; | |
22 if ( scalar @msg ) { croak(@msg); } | |
23 die | |
24 "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n", | |
25 "Options:\n", | |
26 " -h, -?, --help This help message.\n", | |
27 " -i, --indels-only Ignore SNPs.\n", | |
28 " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n", | |
29 " -R, --keep-ref Print reference alleles as well.\n", | |
30 " -s, --snps-only Ignore indels.\n", | |
31 " -t, --column-title <string> The column title.\n", | |
32 "\n"; | |
33 } | |
34 | |
35 | |
36 sub parse_params | |
37 { | |
38 my %opts = (); | |
39 | |
40 $opts{fh_in} = *STDIN; | |
41 $opts{fh_out} = *STDOUT; | |
42 | |
43 while (my $arg=shift(@ARGV)) | |
44 { | |
45 if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; } | |
46 if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; } | |
47 if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; } | |
48 if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; } | |
49 if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; } | |
50 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } | |
51 | |
52 error("Unknown parameter \"$arg\". Run -h for help.\n"); | |
53 } | |
54 return \%opts; | |
55 } | |
56 | |
57 sub iupac_to_gtype | |
58 { | |
59 my ($ref,$base) = @_; | |
60 my %iupac = ( | |
61 'K' => ['G','T'], | |
62 'M' => ['A','C'], | |
63 'S' => ['C','G'], | |
64 'R' => ['A','G'], | |
65 'W' => ['A','T'], | |
66 'Y' => ['C','T'], | |
67 ); | |
68 if ( !exists($iupac{$base}) ) | |
69 { | |
70 if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); } | |
71 if ( $ref eq $base ) { return ('.','0/0'); } | |
72 return ($base,'1/1'); | |
73 } | |
74 my $gt = $iupac{$base}; | |
75 if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); } | |
76 elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); } | |
77 return ("$$gt[0],$$gt[1]",'1/2'); | |
78 } | |
79 | |
80 | |
81 sub parse_indel | |
82 { | |
83 my ($cons) = @_; | |
84 if ( $cons=~/^-/ ) | |
85 { | |
86 my $len = length($'); | |
87 return "D$len"; | |
88 } | |
89 elsif ( $cons=~/^\+/ ) { return "I$'"; } | |
90 elsif ( $cons eq '*' ) { return undef; } | |
91 error("FIXME: could not parse [$cons]\n"); | |
92 } | |
93 | |
94 | |
95 # An example of the pileup format: | |
96 # 1 3000011 C C 32 0 98 1 ^~, A | |
97 # 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0 | |
98 # 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0 | |
99 # 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0 | |
100 # | |
101 sub do_pileup_to_vcf | |
102 { | |
103 my ($opts) = @_; | |
104 | |
105 my $fh_in = $$opts{fh_in}; | |
106 my $fh_out = $$opts{fh_out}; | |
107 my ($prev_chr,$prev_pos,$prev_ref); | |
108 my $refseq; | |
109 my $ignore_indels = $$opts{snps_only} ? 1 : 0; | |
110 my $ignore_snps = $$opts{indels_only} ? 1 : 0; | |
111 my $keep_ref = $$opts{keep_ref} ? 1 : 0; | |
112 my $title = exists($$opts{title}) ? $$opts{title} : 'data'; | |
113 | |
114 print $fh_out | |
115 qq[##fileformat=VCFv3.3\n], | |
116 qq[##INFO=DP,1,Integer,"Total Depth"\n], | |
117 qq[##FORMAT=GT,1,String,"Genotype"\n], | |
118 qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n], | |
119 qq[##FORMAT=DP,1,Integer,"Read Depth"\n], | |
120 qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n] | |
121 ; | |
122 | |
123 while (my $line=<$fh_in>) | |
124 { | |
125 chomp($line); | |
126 my (@items) = split(/\t/,$line); | |
127 if ( scalar @items<8 ) | |
128 { | |
129 error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n"); | |
130 } | |
131 my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items; | |
132 $ref = uc($ref); | |
133 $cons = uc($cons); | |
134 | |
135 my ($alt,$gt); | |
136 if ( $ref eq '*' ) | |
137 { | |
138 # An indel is involved. | |
139 if ( $ignore_indels ) | |
140 { | |
141 $prev_ref = $ref; | |
142 $prev_pos = $pos; | |
143 $prev_chr = $chr; | |
144 next; | |
145 } | |
146 | |
147 if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos) | |
148 { | |
149 if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); } | |
150 if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); } | |
151 $ref = $refseq->get_base($chr,$pos); | |
152 $ref = uc($ref); | |
153 } | |
154 else { $ref = $prev_ref; } | |
155 | |
156 # One of the alleles can be a reference and it can come in arbitrary order. In some | |
157 # cases */* can be encountered. In such a case, look in the additional columns. | |
158 my ($al1,$al2) = split(m{/},$cons); | |
159 if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; } | |
160 my $alt1 = parse_indel($al1); | |
161 my $alt2 = parse_indel($al2); | |
162 if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); } | |
163 if ( !$alt1 ) | |
164 { | |
165 $alt=$alt2; | |
166 $gt='0/1'; | |
167 } | |
168 elsif ( !$alt2 ) | |
169 { | |
170 $alt=$alt1; | |
171 $gt='0/1'; | |
172 } | |
173 elsif ( $alt1 eq $alt2 ) | |
174 { | |
175 $alt="$alt1"; | |
176 $gt='1/1'; | |
177 } | |
178 else | |
179 { | |
180 $alt="$alt1,$alt2"; | |
181 $gt='1/2'; | |
182 } | |
183 } | |
184 else | |
185 { | |
186 if ( $ignore_snps || (!$keep_ref && $ref eq $cons) ) | |
187 { | |
188 $prev_ref = $ref; | |
189 $prev_pos = $pos; | |
190 $prev_chr = $chr; | |
191 next; | |
192 } | |
193 | |
194 # SNP | |
195 ($alt,$gt) = iupac_to_gtype($ref,$cons); | |
196 } | |
197 | |
198 print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n"; | |
199 | |
200 $prev_ref = $ref; | |
201 $prev_pos = $pos; | |
202 $prev_chr = $chr; | |
203 } | |
204 } | |
205 | |
206 | |
207 #------------- Fasta -------------------- | |
208 # | |
209 # Uses samtools to get a requested base from a fasta file. For efficiency, preloads | |
210 # a chunk to memory. The size of the cached sequence can be controlled by the 'size' | |
211 # parameter. | |
212 # | |
213 package Fasta; | |
214 | |
215 use strict; | |
216 use warnings; | |
217 use Carp; | |
218 | |
219 sub Fasta::new | |
220 { | |
221 my ($class,@args) = @_; | |
222 my $self = {@args}; | |
223 bless $self, ref($class) || $class; | |
224 if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); } | |
225 $$self{chr} = undef; | |
226 $$self{from} = undef; | |
227 $$self{to} = undef; | |
228 if ( !$$self{size} ) { $$self{size}=10_000_000; } | |
229 bless $self, ref($class) || $class; | |
230 return $self; | |
231 } | |
232 | |
233 sub read_chunk | |
234 { | |
235 my ($self,$chr,$pos) = @_; | |
236 my $to = $pos + $$self{size}; | |
237 my $cmd = "samtools faidx $$self{file} $chr:$pos-$to"; | |
238 my @out = `$cmd`; | |
239 if ( $? ) { $self->throw("$cmd: $!"); } | |
240 my $line = shift(@out); | |
241 if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); } | |
242 $$self{chr} = $chr; | |
243 $$self{from} = $1; | |
244 $$self{to} = $2; | |
245 my $chunk = ''; | |
246 while ($line=shift(@out)) | |
247 { | |
248 chomp($line); | |
249 $chunk .= $line; | |
250 } | |
251 $$self{chunk} = $chunk; | |
252 return; | |
253 } | |
254 | |
255 sub get_base | |
256 { | |
257 my ($self,$chr,$pos) = @_; | |
258 if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} ) | |
259 { | |
260 $self->read_chunk($chr,$pos); | |
261 } | |
262 my $idx = $pos - $$self{from}; | |
263 return substr($$self{chunk},$idx,1); | |
264 } | |
265 | |
266 sub throw | |
267 { | |
268 my ($self,@msg) = @_; | |
269 croak(@msg); | |
270 } |