Mercurial > repos > jjohnson > crest
comparison extractSClip.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev'; | |
4 use Carp; | |
5 use Getopt::Long; | |
6 use English; | |
7 use Pod::Usage; | |
8 use Data::Dumper; | |
9 use Bio::DB::Sam; | |
10 use File::Temp qw/ tempfile tempdir /; | |
11 use File::Spec; | |
12 use File::Basename; | |
13 use Cwd; | |
14 use List::MoreUtils qw/ uniq /; | |
15 #custom packages | |
16 use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP); | |
17 use SVUtil qw($use_scratch $work_dir get_input_bam get_work_dir parse_range is_PCR_dup); | |
18 | |
19 use constant FQ_BASE_NUMBER => 33; | |
20 | |
21 my $rmdup = 0; | |
22 my $print_read = 1; | |
23 $use_scratch = 0; | |
24 $min_percent_id = 90; | |
25 my ($work_dir, $out_dir); | |
26 my $ref_genome; | |
27 # input/output | |
28 my ($out_prefix, $range, $input_bam ); | |
29 my $out_suffix = "sclip.txt"; | |
30 my $paired = 1; | |
31 my ( $help, $man, $version, $usage ); | |
32 my $optionOK = GetOptions( | |
33 'i|in|input=s' => \$input_bam, | |
34 'o|out_dir=s' => \$out_dir, | |
35 'ref_genome=s' => \$ref_genome, | |
36 'p=s' => \$out_prefix, | |
37 'scratch!' => \$use_scratch, | |
38 'paired!' => \$paired, | |
39 'rmdup!' => \$rmdup, | |
40 'lq_cutoff=i' => \$lowqual_cutoff, | |
41 'min_pct_id=i' => \$min_percent_id, | |
42 'min_pct_hq=i' => \$min_percent_hq, | |
43 'print_read!' => \$print_read, | |
44 'r|range=s' => \$range, | |
45 'h|help|?' => \$help, | |
46 'man' => \$man, | |
47 'usage' => \$usage, | |
48 'v|version' => \$version, | |
49 ); | |
50 | |
51 pod2usage(-verbose=>2) if($man or $usage); | |
52 pod2usage(1) if($help or $version ); | |
53 | |
54 my $start_dir = getcwd; | |
55 if($input_bam) { | |
56 croak "The bam file you specified does not exist!\n" unless(-e $input_bam); | |
57 $input_bam = File::Spec->rel2abs($input_bam); | |
58 } | |
59 else{ | |
60 croak "You must specify the input bam file or sample name"; | |
61 } | |
62 croak "You must provide the reference genome in fasta format!" if(!$ref_genome); | |
63 croak "The reference genome file you speicified does not exist!\n" | |
64 unless(-e $ref_genome); | |
65 | |
66 my $input_base = fileparse($input_bam); | |
67 | |
68 #setup output dir and workind directory | |
69 $out_dir = getcwd if(!$out_dir); | |
70 mkdir $out_dir if(!-e $out_dir || ! -d $out_dir); | |
71 $work_dir = get_work_dir() if($use_scratch); | |
72 $work_dir = $out_dir if(!$work_dir); | |
73 $use_scratch = undef if($work_dir eq $out_dir); | |
74 chdir($work_dir); | |
75 | |
76 # figure out output prefix | |
77 $out_prefix = $input_base if(!$out_prefix); | |
78 | |
79 my $sam = Bio::DB::Sam->new( -bam => $input_bam, -fasta => $ref_genome); | |
80 | |
81 my $output_file; | |
82 my $validator = SCValidator->new(); | |
83 if(!$paired) { | |
84 $validator->remove_validator("strand_validator"); | |
85 } | |
86 if($range) { | |
87 my ($chr, $start, $end) = parse_range($range); | |
88 my $tmp = $chr; | |
89 $tmp = $tmp . ".$start" if($start); | |
90 $tmp = $tmp . ".$end" if($end); | |
91 $output_file = join('.', $out_prefix, $tmp, $out_suffix); | |
92 $output_file = File::Spec->catfile($out_dir, $output_file); | |
93 | |
94 my($pcover, $ncover) = extract_range_sclip( | |
95 -SAM => $sam, | |
96 -RANGE => $range, | |
97 -WORK_DIR => $work_dir, | |
98 -OUTPUT => $output_file, | |
99 -VALIDATOR => $validator); | |
100 $output_file = join('.', $out_prefix, $tmp, "cover"); | |
101 $output_file = File::Spec->catfile($out_dir, $output_file); | |
102 warn "$output_file file exists and it will be replaced!" if(-e $output_file); | |
103 $rmdup = 0; | |
104 #$start = 0 unless $start; | |
105 #my ($tid) = $sam->header->parse_region($chr); | |
106 #my $chr_len = $sam->header->target_len->[$tid]; | |
107 #$end = $chr_len unless $end; | |
108 | |
109 #my ($coverage) = $sam->features(-type => 'coverage', -seq_id => $chr, | |
110 # -start => $start, -end => $end); | |
111 #my @cc = $coverage->coverage; | |
112 | |
113 $rmdup = 0; | |
114 my($fh, $fname) = tempfile( DIR => $work_dir); | |
115 foreach my $p (keys(%{$pcover})) { | |
116 my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p}); | |
117 print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p)), "\n"; | |
118 } | |
119 foreach my $p (keys(%{$ncover})) { | |
120 my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p}); | |
121 print $fh join("\t", $chr, $p, "-", $c , count_coverage($sam, $chr, $p)), "\n"; | |
122 } | |
123 system("sort -k 2 -n $fname -o $output_file"); | |
124 system("rm $fname"); | |
125 } | |
126 else{ | |
127 $output_file = join('.', $out_prefix, $out_suffix); | |
128 $output_file = File::Spec->catfile($out_dir, $output_file); | |
129 my $header = $sam->header; | |
130 my $target_names = $header->target_name; | |
131 $output_file = join('.', $out_prefix, "cover"); | |
132 my $read_file = join('.', $out_prefix, "sclip.txt"); | |
133 $output_file = File::Spec->catfile($out_dir, $output_file); | |
134 $read_file = File::Spec->catfile($out_dir, $read_file); | |
135 | |
136 if(-e $output_file) { | |
137 warn "$output_file file exists and it will be replaced!"; | |
138 system("rm $output_file"); | |
139 } | |
140 my @t_names = uniq @{$target_names}; | |
141 foreach my $chr (@t_names){ | |
142 my($fh, $fname) = tempfile( DIR => $work_dir); | |
143 my($pcover, $ncover) = extract_range_sclip( | |
144 -SAM => $sam, | |
145 -RANGE =>$chr, | |
146 -WORK_DIR => $work_dir, | |
147 -OUTPUT => $read_file, | |
148 -VALIDATOR => $validator); | |
149 foreach my $p (keys(%{$pcover})) { | |
150 my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p}); | |
151 print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p) ), "\n"; | |
152 } | |
153 foreach my $p (keys(%{$ncover})) { | |
154 my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p}); | |
155 print $fh join("\t", $chr, $p, "-", $c, count_coverage($sam, $chr, $p)), "\n"; | |
156 } | |
157 system("sort -k 2 -n $fname -o $fname.sorted"); | |
158 system("cat $fname.sorted >> $output_file"); | |
159 system("rm $fname"); | |
160 system("rm $fname.sorted"); | |
161 } | |
162 } | |
163 chdir $start_dir; | |
164 exit(0); | |
165 | |
166 sub count_coverage { | |
167 my ($sam, $chr, $pos, $clip) = @_; | |
168 if($rmdup) { | |
169 my @pairs; | |
170 my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos); | |
171 my $n = 0; | |
172 my $itr = $seg->features(-iterator => 1); | |
173 while( my $a = $itr->next_seq) { | |
174 my $sclip_len = 0; | |
175 if($clip) { | |
176 my @cigar_array = @{$a->cigar_array}; | |
177 $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP); | |
178 $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP); | |
179 } | |
180 next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len)); | |
181 $n++; | |
182 #return $n if( $n > $max_repetitive_cover); | |
183 if($a->mpos) { | |
184 push @pairs, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len]; | |
185 } | |
186 else { | |
187 push @pairs, [$a->start, $a->end, 0, 0, $sclip_len]; | |
188 } | |
189 | |
190 } | |
191 return $n; | |
192 } | |
193 else{ | |
194 my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, | |
195 -start => $pos, -end => $pos); | |
196 my @c_d = $c->coverage; | |
197 return $c_d[0]; | |
198 } | |
199 } | |
200 | |
201 sub extract_range_sclip { | |
202 my %arg = @_; | |
203 my $sam = $arg{-SAM} || croak "missing -SAM"; | |
204 my $range = $arg{-RANGE} || croak "missing -RANGE"; | |
205 my $output_file = $arg{-OUTPUT} || croak "missing -OUTPUT"; | |
206 my $validator = $arg{-VALIDATOR} || croak "missing -VALIDATOR"; | |
207 | |
208 my($fh, $fname) = tempfile( DIR => $work_dir); | |
209 my (%plus_cover, %neg_cover); | |
210 $sam->fetch($range, | |
211 sub { | |
212 my $a = shift; | |
213 my $cigar_str = $a->cigar_str; | |
214 # print STDERR $a->qname, "\t", $cigar_str, "\n"; | |
215 my @cigar_array = @{$a->cigar_array}; | |
216 return if($a->cigar_str !~ m/S/); | |
217 if($paired && !$a->proper_pair) { #paired but mate is not mapped | |
218 $validator->remove_validator("strand_validator"); | |
219 } | |
220 #return if(!$a->proper_pair && $paired); | |
221 #return if($paired && !$a->mpos); | |
222 my ($sclip_len, $ort, $pos, $seq, $qual_str, $qual); | |
223 $qual_str = join( "", (map { chr $_ + FQ_BASE_NUMBER } $a->qscore)); | |
224 if($cigar_array[0]->[0] eq 'S' && $validator->validate($a, LEFT_CLIP) ) { | |
225 $sclip_len = $cigar_array[0]->[1]; $ort = "-"; $pos = $a->start; | |
226 $seq = substr($a->query->dna, 0, $sclip_len ); | |
227 $qual = substr($qual_str, 0, $sclip_len); | |
228 | |
229 my $print = 1; | |
230 if($rmdup) { | |
231 if(exists $neg_cover{$pos}) { | |
232 $print = 0 if(is_PCR_dup($a, $neg_cover{$pos}, $sclip_len)); | |
233 } | |
234 else { | |
235 $neg_cover{$pos} = []; | |
236 } | |
237 if($print == 1) { | |
238 if($a->mpos) { | |
239 push @{$neg_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len]; | |
240 } | |
241 else { | |
242 push @{$neg_cover{$pos}}, [$a->start, $a->end, 0, 0, $sclip_len]; | |
243 } | |
244 } | |
245 } | |
246 else { | |
247 if(exists $neg_cover{$pos}) { | |
248 $neg_cover{$pos}++; | |
249 } | |
250 else{ | |
251 $neg_cover{$pos} = 1; | |
252 } | |
253 } | |
254 print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n" | |
255 if($print_read && $print == 1); | |
256 } | |
257 | |
258 if($cigar_array[$#cigar_array]->[0] eq 'S' && $validator->validate($a, RIGHT_CLIP)) { | |
259 $sclip_len = $cigar_array[$#cigar_array]->[1]; $ort = '+'; $pos = $a->end; | |
260 my $l = length($a->query->dna); | |
261 $seq = substr($a->query->dna, $l - $sclip_len); | |
262 $qual = substr($qual_str, $l - $sclip_len); | |
263 | |
264 my $print = 1; | |
265 if($rmdup) { | |
266 if(exists $plus_cover{$pos}) { | |
267 $print = 0 if(is_PCR_dup($a, $plus_cover{$pos}, $sclip_len)); | |
268 } | |
269 else { | |
270 $plus_cover{$pos} = []; | |
271 } | |
272 if($print ==1 ) { | |
273 if($a->mpos) { | |
274 push @{$plus_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len]; | |
275 } | |
276 else { | |
277 push @{$plus_cover{$pos}}, [$a->start, 0, 0, $a->mate_end, $sclip_len]; | |
278 } | |
279 } | |
280 } | |
281 else { | |
282 if(exists $plus_cover{$pos}) { | |
283 $plus_cover{$pos}++; | |
284 } | |
285 else{ | |
286 $plus_cover{$pos} = 1; | |
287 } | |
288 } | |
289 $print = is_PCR_dup($a, $plus_cover{$pos}, $sclip_len) if($rmdup); | |
290 print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n" | |
291 if($print_read && $print); | |
292 } | |
293 if($paired && !$a->proper_pair) { #paired but mate is not mapped, add back | |
294 $validator->add_validator("strand_validator"); | |
295 } | |
296 } | |
297 ); | |
298 if($print_read) { | |
299 system("sort -k 2 -n $fname -o $fname.sorted"); | |
300 system("cat $fname.sorted >> $output_file"); | |
301 system("rm $fname"); | |
302 system("rm $fname.sorted"); | |
303 } | |
304 return(\%plus_cover, \%neg_cover); | |
305 } | |
306 | |
307 | |
308 =head1 NAME | |
309 | |
310 extractSClip.pl - extract positions with soft clipped read in bam file. | |
311 | |
312 | |
313 =head1 VERSION | |
314 | |
315 This documentation refers to extractSClip.pl version 0.0.1. | |
316 | |
317 | |
318 =head1 USAGE | |
319 | |
320 # extract all positions with soft clipped reads in whole genome: | |
321 ./extractSClip.pl -i sample.bam -g hg18.fa | |
322 # extract chr1 positions with soft clipped reads | |
323 ./extractSClip.pl -i sample.bam -g hg18.fa -r chr1 | |
324 | |
325 | |
326 =head1 REQUIRED ARGUMENTS | |
327 | |
328 -i: Input bam file. | |
329 --ref_genome: The genome file in fa file, must be the same used to map reads. | |
330 | |
331 | |
332 =head1 OPTIONS | |
333 | |
334 -r: The range of positions need to be extracted. Format: chr1 or chr1:500-5000. | |
335 -o: The output directory, default is current directory. | |
336 --scratch: use scracth space, default off. | |
337 --rmdup: remove PCR dumplicate reads, default on, use --normdup to turn it off. | |
338 --lq_cutoff: low quality cutoff value, default 20. | |
339 --min_pct_id: minimum percent identify for the aligned high qual part,default 90. | |
340 --min_pct_hq: minimum percent high quality for soft clipped part, default 80. | |
341 --print_read: individual soft-clipped read will be printed, default off. | |
342 -h, --help: The help page. | |
343 --man: Print the man page. | |
344 --usage: Print usage information. | |
345 -v, --version: print version information. | |
346 | |
347 | |
348 =head1 DESCRIPTION | |
349 | |
350 This is a program to extract all soft-clipped positions such that for each position | |
351 a list of requirements need to be satisfied. More specifically, the orientaion of | |
352 pair-end read should be satisfied, the minimum percent identify of aligned part | |
353 need to be satisfied, and the minimum percent of hiqh quality soft-clipped part | |
354 should be satisfied. | |
355 | |
356 | |
357 =head1 DEPENDENCIES | |
358 | |
359 The program depend on several packages: | |
360 1. Bioperl perl module. | |
361 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed. | |
362 | |
363 =head1 BUGS AND LIMITATIONS | |
364 | |
365 There are no known bugs in this module, but the method is limitted to bam file | |
366 that has soft-clipping cigar string generated.Please report problems to | |
367 Jianmin Wang (Jianmin.Wang@stjude.org) | |
368 Patches are welcome. | |
369 | |
370 =head1 AUTHOR | |
371 | |
372 Jianmin Wang (Jianmin.Wang@stjude.org) | |
373 | |
374 | |
375 =head1 LICENCE AND COPYRIGHT | |
376 | |
377 Copyright (c) 2010 by St. Jude Children's Research Hospital. | |
378 | |
379 This program is free software: you can redistribute it and/or modify | |
380 it under the terms of the GNU General Public License as published by | |
381 the Free Software Foundation, either version 2 of the License, or | |
382 (at your option) any later version. | |
383 | |
384 This program is distributed in the hope that it will be useful, | |
385 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
386 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
387 GNU General Public License for more details. | |
388 | |
389 You should have received a copy of the GNU General Public License | |
390 along with this program. If not, see <http://www.gnu.org/licenses/>. |