annotate PsiCLASS-1.0.2/samtools-0.1.19/misc/sam2vcf.pl @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #!/usr/bin/perl -w
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 # VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 # Contact: pd3@sanger
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 # Version: 2010-04-23
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 use strict;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 use warnings;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 use Carp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Script entry point: collect the command-line options, run the
# pileup-to-VCF conversion with them, then leave with a success status.
my $opts = parse_params();
do_pileup_to_vcf($opts);
exit 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 #---------------
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Abort the script.
#
# Called with arguments, croaks with them as the error message.  Called
# without arguments, dies with the usage text instead.  Never returns.
sub error
{
    croak(@_) if @_;

    die join('',
        "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
        "Options:\n",
        " -h, -?, --help This help message.\n",
        " -i, --indels-only Ignore SNPs.\n",
        " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
        " -R, --keep-ref Print reference alleles as well.\n",
        " -s, --snps-only Ignore indels.\n",
        " -t, --column-title <string> The column title.\n",
        "\n",
    );
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Parse @ARGV (which is consumed) into an options hash.
#
# Returns a hash reference that always holds
#   fh_in, fh_out   the STDIN and STDOUT globs
# and, depending on the arguments given,
#   keep_ref        1 for -R / --keep-ref
#   snps_only       1 for -s / --snps-only
#   indels_only     1 for -i / --indels-only
#   refseq          the value following -r / --refseq
#   title           the value following -t / --column-title
#
# Calls error() for -h / -? / --help (usage text), for an unknown
# argument, and for a value-taking option that ends the argument list.
sub parse_params
{
    my %opts = ();

    $opts{fh_in}  = *STDIN;
    $opts{fh_out} = *STDOUT;

    # Switches that only set a flag, and options that consume the next
    # argument; both map the spelling on the command line to the %opts key.
    my %flag_key = (
        '-R' => 'keep_ref',    '--keep-ref'    => 'keep_ref',
        '-s' => 'snps_only',   '--snps-only'   => 'snps_only',
        '-i' => 'indels_only', '--indels-only' => 'indels_only',
    );
    my %value_key = (
        '-r' => 'refseq', '--refseq'       => 'refseq',
        '-t' => 'title',  '--column-title' => 'title',
    );

    # Loop on the number of remaining arguments, not on the truth of the
    # shifted value: an argument such as "0" or "" is false and would
    # otherwise end the parsing silently with the rest of @ARGV unread.
    while ( @ARGV )
    {
        my $arg = shift(@ARGV);

        if ( exists $flag_key{$arg} ) { $opts{ $flag_key{$arg} } = 1; next; }

        if ( exists $value_key{$arg} )
        {
            # Without this check a trailing -r or -t would store undef.
            if ( !@ARGV ) { error("Missing value for the parameter \"$arg\". Run -h for help.\n"); }
            $opts{ $value_key{$arg} } = shift(@ARGV);
            next;
        }

        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }

        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }
    return \%opts;
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Translate a consensus base code into a VCF alternate allele and genotype.
#
#   $ref   the reference base
#   $base  the consensus call: A, C, G, T or one of the two-base
#          ambiguity codes K, M, S, R, W, Y
#
# Returns the list (ALT, GT):
#   ('.', '0/0')    consensus equals the reference
#   (base, '1/1')   plain base that differs from the reference
#   (other, '0/1')  ambiguity code that contains the reference base
#   ('X,Y', '1/2')  ambiguity code whose two bases both differ from it
# Any other code is reported through error().
sub iupac_to_gtype
{
    my ($ref,$base) = @_;

    my %pair_for = (
        K => [qw(G T)],
        M => [qw(A C)],
        S => [qw(C G)],
        R => [qw(A G)],
        W => [qw(A T)],
        Y => [qw(C T)],
    );

    # Not an ambiguity code: must be a plain base, hence a homozygous call.
    if ( !exists($pair_for{$base}) )
    {
        error("FIXME: what is this [$base]?\n") unless $base =~ /\A[ACGT]\z/;
        return $ref eq $base ? ('.','0/0') : ($base,'1/1');
    }

    # Heterozygous call: report whichever allele is not the reference,
    # or both of them when neither matches it.
    my ($first,$second) = @{ $pair_for{$base} };
    return ($second,'0/1') if $first  eq $ref;
    return ($first,'0/1')  if $second eq $ref;
    return ("$first,$second",'1/2');
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Convert one indel allele from the pileup consensus column into the
# notation used for the ALT column.
#
#   '-TT'  ->  'D2'   deletion: D followed by the number of deleted bases
#   '+T'   ->  'IT'   insertion: I followed by the inserted sequence
#   '*'    ->  undef  reference allele, no indel
#
# Anything else is reported through error().
sub parse_indel
{
    my ($cons) = @_;

    # The allele text is captured explicitly rather than read from the $'
    # post-match variable, whose mere presence slows down every regex in
    # the program on perls older than 5.20.
    if ( $cons =~ /^-(.*)/s )  { return 'D' . length($1); }
    if ( $cons =~ /^\+(.*)/s ) { return "I$1"; }

    # A scalar undef is returned on purpose: the result is meant to be
    # assigned to a scalar and tested for truth.
    if ( $cons eq '*' ) { return undef; }

    error("FIXME: could not parse [$cons]\n");
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 # An example of the pileup format:
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 # 1 3000011 C C 32 0 98 1 ^~, A
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 # 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 # 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 # 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 #
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Read pileup records from $$opts{fh_in} and write them as VCFv3.3 to
# $$opts{fh_out}.
#
# Keys read from the $opts hash reference:
#   fh_in, fh_out  input and output file handles
#   snps_only      true: skip indel records (reference column is '*')
#   indels_only    true: skip all non-indel records
#   keep_ref       true: also print sites whose consensus equals the reference
#   title          name of the sample column; 'data' when the key is absent
#   refseq         reference fasta file; needed as soon as an indel record
#                  has to be printed and its base is not known from the
#                  preceding record
#
# A line with fewer than 8 tab-separated columns, an indel without a
# usable reference and an unparsable indel are all fatal via error().
sub do_pileup_to_vcf
{
    my ($opts) = @_;

    my $in  = $$opts{fh_in};
    my $out = $$opts{fh_out};

    my $skip_indels = $$opts{snps_only}   ? 1 : 0;
    my $skip_snps   = $$opts{indels_only} ? 1 : 0;
    my $keep_ref    = $$opts{keep_ref}    ? 1 : 0;
    my $title       = exists($$opts{title}) ? $$opts{title} : 'data';

    # Fasta reader, created lazily on the first indel that needs it.
    my $fasta;

    # Chromosome, position and reference base of the record seen last,
    # whether or not it was printed.  An indel record at the same site
    # takes its reference base from here instead of from the fasta file.
    my ($seen_chr,$seen_pos,$seen_ref);
    my $remember = sub { ($seen_chr,$seen_pos,$seen_ref) = @_; return; };

    print {$out}
        qq[##fileformat=VCFv3.3\n],
        qq[##INFO=DP,1,Integer,"Total Depth"\n],
        qq[##FORMAT=GT,1,String,"Genotype"\n],
        qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
        qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
        qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n];

    while (my $line=<$in>)
    {
        chomp($line);
        my @items = split(/\t/,$line);
        error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n") if @items < 8;

        my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
        $ref  = uc($ref);
        $cons = uc($cons);

        my ($alt,$gt);
        if ( $ref ne '*' )
        {
            # Ordinary site: a SNP or a plain reference call.
            if ( $skip_snps || (!$keep_ref && $ref eq $cons) )
            {
                $remember->($chr,$pos,$ref);
                next;
            }
            ($alt,$gt) = iupac_to_gtype($ref,$cons);
        }
        elsif ( $skip_indels )
        {
            $remember->($chr,$pos,$ref);
            next;
        }
        else
        {
            # Indel record: the pileup gives '*' as the reference, so the
            # real base comes from the preceding record when that was the
            # same site, and from the fasta file otherwise.
            my $same_site = defined $seen_chr && $chr eq $seen_chr && $pos eq $seen_pos;
            if ( $same_site )
            {
                $ref = $seen_ref;
            }
            else
            {
                if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
                $fasta ||= Fasta->new(file=>$$opts{refseq});
                $ref = uc($fasta->get_base($chr,$pos));
            }

            # Either allele may be the reference one ('*') and they come
            # in arbitrary order.  When the consensus is */*, the alleles
            # are taken from the two columns following the depth instead.
            my ($first,$second) = split(m{/},$cons);
            if ( $first eq $second && $first eq '*' ) { ($first,$second) = ($a1,$a2); }

            # Keep only the alleles that really are indels.
            my @indels = grep { $_ } map { scalar parse_indel($_) } ($first,$second);
            if ( !@indels ) { error("FIXME: could not parse indel:\n", $line); }

            if    ( @indels == 1 )              { ($alt,$gt) = ($indels[0],'0/1'); }
            elsif ( $indels[0] eq $indels[1] )  { ($alt,$gt) = ($indels[0],'1/1'); }
            else                                { ($alt,$gt) = (join(',',@indels),'1/2'); }
        }

        print {$out} "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";

        $remember->($chr,$pos,$ref);
    }
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 #------------- Fasta --------------------
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 #
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 # Uses samtools to get a requested base from a fasta file. For efficiency, preloads
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 # a chunk to memory. The size of the cached sequence can be controlled by the 'size'
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 # parameter.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 #
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 package Fasta;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 use strict;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 use warnings;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 use Carp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Constructor.  Takes key => value pairs:
#   file   the fasta file to read from (required, throws when missing)
#   size   number used by read_chunk to size the region it fetches;
#          10_000_000 when not given or false
# The cache coordinates chr, from and to always start out undefined,
# whatever was passed in.
sub Fasta::new
{
    my $class = shift;
    my $self  = bless { @_ }, ref($class) || $class;

    $self->throw(qq[Missing the parameter "file"\n]) unless $$self{file};

    @$self{qw(chr from to)} = (undef) x 3;
    $$self{size} ||= 10_000_000;

    return $self;
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Fill the cache with the region of $chr that starts at $pos and ends at
# $pos + size, fetched with 'samtools faidx'.
#
# On success the object holds
#   chr    the chromosome name
#   from   first coordinate, as printed in the fasta header of the output
#   to     last coordinate, as printed in the fasta header of the output
#   chunk  the sequence lines joined into one string
# Returns nothing.  Throws when samtools cannot be started, fails, or
# prints a header that does not look like ">$chr:from-to".
sub read_chunk
{
    my ($self,$chr,$pos) = @_;
    my $to     = $pos + $$self{size};
    my $region = "$chr:$pos-$to";

    # Only used in error messages; the command itself is run without a shell.
    my $cmd = "samtools faidx $$self{file} $region";

    # List-form pipe open: file and region are handed to samtools as
    # separate arguments, so spaces or shell metacharacters in either of
    # them cannot break or alter the command.
    open(my $pipe, '-|', 'samtools', 'faidx', $$self{file}, $region)
        or $self->throw("$cmd: $!");
    my @out = <$pipe>;
    if ( !close($pipe) )
    {
        # $! is only meaningful when the close itself failed; otherwise the
        # child's wait status in $? is what there is to report.
        $self->throw( $! ? "$cmd: $!" : "$cmd: failed with wait status $?" );
    }

    my $header = shift(@out);
    if ( !defined $header ) { $self->throw("Could not parse: no output from $cmd\n"); }

    # \Q..\E: the chromosome name is literal text, not a pattern; names
    # such as "HLA-A*01:01" contain regex metacharacters.
    my ($from,$upto) = $header =~ /^>\Q$chr\E:(\d+)-(\d+)/;
    if ( !defined $from ) { $self->throw("Could not parse: $header"); }

    chomp(@out);
    $$self{chr}   = $chr;
    $$self{from}  = $from;
    $$self{to}    = $upto;
    $$self{chunk} = join('',@out);
    return;
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Return the single base of chromosome $chr at coordinate $pos.
#
# The base is served from the cached chunk; the chunk is (re)loaded through
# read_chunk first when nothing is cached yet, when it belongs to another
# chromosome, or when $pos lies outside its from..to range.
sub get_base
{
    my ($self,$chr,$pos) = @_;

    # "defined" rather than a truth test: a chromosome named "0" is a
    # perfectly valid cache owner and must not force a reload on every call.
    if ( !defined $$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
    {
        $self->read_chunk($chr,$pos);
    }

    # The chunk starts at coordinate "from", so that is offset 0.
    my $idx = $pos - $$self{from};
    return substr($$self{chunk},$idx,1);
}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
# Raise an error: croak with the given message parts.  The invocant is
# not used.
sub throw
{
    my $self = shift;
    croak(@_);
}