annotate SVUtil.pm @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
1 package SVUtil;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
2
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
3 use strict;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
4 use Carp;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
5 use English;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
6 use Bio::SeqIO;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
7 use Bio::DB::Sam;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
8 use File::Spec;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
9 use File::Path qw(remove_tree);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
10 use File::Temp qw/ tempfile tempdir /;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
11 use Cwd;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
12 use SCValidator qw(LEFT_CLIP RIGHT_CLIP);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
13 require Exporter;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
14 our @ISA = ('Exporter');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
15 our @EXPORT = qw(is_new_pair clip_fa_file prepare_reads parse_range
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
16 get_input_bam get_work_dir is_PCR_dup read_fa_file find_smallest_cover
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
17 get_sclip_reads rev_comp);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
18 our @EXPORT_OK = qw($use_scratch $spacer_seq $spcer_seq_len $work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
19 our $use_scratch = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
20 our $work_dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
21
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
22 sub rev_comp {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
23 my $str = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
24 $str = reverse $str;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
25 $str =~ tr/ACTG/TGAC/;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
26 return $str;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
27 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
28 # something about binormial distribution
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
29 sub choose {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
30 my ($n,$k) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
31 my ($result,$j) = (1,1);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
32
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
33 return 0 if $k>$n||$k<0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
34 $k = ($n - $k) if ($n - $k) <$k;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
35
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
36 while ($j <= $k ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
37 $result *= $n--;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
38 $result /= $j++;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
39 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
40 return $result;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
41 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
42
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
43 sub dbinorm { #desity funtion
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
44 my($n, $k, $p) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
45 return unless($n > 0 && $k >= 0 && $n >= $k && $p >= 0 && $p <=1);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
46 my $prob = log(choose($n, $k)) + ($n-$k)*log(1-$p) + $k*log($p);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
47 return exp($prob);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
48 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
49
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
50 sub find_smallest_cover {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
51 my ($c, $p, $crit) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
52 my @s_cover;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
53 for( my $i = 1; $i < $c; $i++) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
54 my $n = 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
55 while(1) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
56 my $tmp = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
57 for( my $j = 0; $j <= $i && $j < $n; $j++) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
58 $tmp += dbinorm($n, $j, $p);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
59 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
60 if( $tmp < $crit) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
61 $s_cover[$i] = $n - 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
62 last;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
63 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
64 $n++;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
65 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
66 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
67 return \@s_cover;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
68 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
69
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
70
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
71
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
72 sub read_fa_file {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
73 my $file = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
74 my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
75 my %seqs;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
76 while( my $seq=$in->next_seq()) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
77 $seqs{$seq->display_id} = $seq->seq;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
78 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
79 return \%seqs;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
80 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
81
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
82 # rmdup: remove PCR dumplication related code
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
83 # you can replace it using your own criteria
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
84 sub is_new_pair {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
85 my ($a, $sclip_len, $pairs) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
86 foreach my $p (@{$pairs}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
87 return undef if($a->start == $p->[0] && $a->end == $p->[1] &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
88 $a->mate_start == $p->[2] && $a->mate_end == $p->[3] && $sclip_len == $p->[4]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
89 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
90 return 1;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
91 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
92
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
93 #
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
94 #clipping sequnce related code
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
95 #
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
96 # the spacer sequence, you can replace it with your own one
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
97 my $spacer_seq = 'ACGTACGT' . 'CGTA' x 6 . 'ATGCATGC';
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
98 my $spacer_seq_len = length($spacer_seq);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
99
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
100 # clip the reads that with spacer_seq at one end
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
101 sub clip_fa_file {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
102 my ($file, $ort) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
103 my $out = $file . ".clip.fa";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
104 my $in = Bio::SeqIO->new( -file => $file, -format => 'Fasta');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
105 open my $OUT, ">$out" or croak "can't open $file.fa:$OS_ERROR";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
106 while( my $seq=$in->next_seq()) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
107 print $OUT ">", $seq->display_id, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
108 if($ort eq RIGHT_CLIP) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
109 print $OUT $seq->subseq($spacer_seq_len + 1, $seq->length), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
110 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
111 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
112 print $OUT $seq->subseq(1, $seq->length - $spacer_seq_len), "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
113 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
114 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
115 close($OUT);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
116 return $out;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
117 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
118
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
119 # prepare the reads into a fasta file
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
120 # due to most assembler can't handle short read well, we add
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
121 # the spacer_seq to the start/end of each sequence arrording to
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
122 # the softclip postion
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
123 sub prepare_reads {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
124 my($r_reads, $clip) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
125 my ($fh, $fa_name) = tempfile( DIR => $work_dir, SUFFIX => '.fa');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
126 my $qual_name = $fa_name . ".qual";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
127 my $spacer_qual = ' 50 ' x $spacer_seq_len;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
128
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
129 open my $qual_fh, ">$qual_name" or croak "can't open $qual_name:$OS_ERROR";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
130 foreach my $r (@{$r_reads}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
131 print $fh ">", $r->{name}, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
132 my $tmp = $r->{seq};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
133 $tmp = defined $clip ?
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
134 ($clip == RIGHT_CLIP ? $spacer_seq . $tmp : $tmp . $spacer_seq) :
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
135 $tmp;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
136 print $fh $tmp, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
137 next unless $r->{qual};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
138 print $qual_fh ">", $r->{name}, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
139 $tmp = join(" ", @{$r->{qual}});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
140 $tmp = defined $clip ?
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
141 ($clip == RIGHT_CLIP ? $spacer_qual . $tmp : $tmp . $spacer_qual) :
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
142 $tmp;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
143 print $qual_fh $tmp, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
144 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
145 close $qual_fh;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
146 return $fa_name;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
147 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
148
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
149 # check PCR duplicate
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
150 sub is_PCR_dup {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
151 my ($a, $pairs, $sclip_len) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
152 my ($s, $e, $ms, $me ) = ($a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
153 $a->mate_end ? $a->mate_end : 0);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
154 foreach my $p (@{$pairs}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
155 return 1 if($s == $p->[0] && $e == $p->[1] &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
156 $ms == $p->[2] && $me == $p->[3] && $sclip_len == $p->[4]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
157 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
158 return;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
159 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
160
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
161 # parse the range of input, format is: chr:start-end
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
162 # start and end is optional
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
163 sub parse_range {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
164 my $range = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
165 my ($chr, $start, $end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
166 my @field = split /:/, $range;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
167 $chr = $field[0];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
168 # $chr = substr($chr, 3) if($chr =~ /^chr/);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
169 if(@field > 1) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
170 @field = split /-/, $field[1];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
171 $start = $field[0];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
172 $end = $field[1] if($field[1]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
173 if($start !~ m/^\d+$/ or $end !~ m/^\d+$/) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
174 croak "wrong range format, need to be: chr:start-end";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
175 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
176 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
177 return ($chr, $start, $end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
178 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
179
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
180 # St. Jude only
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
181 # grab the bam file from the bam dir
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
182 my $raw_bam_dir = "/nfs_exports/genomes/1/PCGP/BucketRaw";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
183 sub get_input_bam {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
184 my $sample = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
185 my ($group) = $sample =~ /(.*?)\d+/;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
186 my $tmp_bam_dir = File::Spec->catdir($raw_bam_dir, $group);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
187 opendir(my $dh, $tmp_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
188 my @files = grep { /^$sample-.*bam$/ } readdir($dh);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
189 close $dh;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
190 return File::Spec->catfile($raw_bam_dir, $group, $files[0]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
191 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
192
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
193 #scratch related code
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
194 sub get_work_dir {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
195 my %options = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
196 my $scratch_dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
197 if($options{-SCRATCH}){
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
198 $scratch_dir = (exists $options{-DIR} ? $options{-DIR} :
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
199 (-e '/scratch_space' ? '/scratch_space' : undef) );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
200 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
201
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
202 my $dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
203 if($scratch_dir) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
204 $scratch_dir = File::Spec->rel2abs($scratch_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
205 if($ENV{'JOB_ID'}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
206 $dir = File::Spec->catdir($scratch_dir, $ENV{'JOB_ID'})
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
207 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
208 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
209 $dir = tempdir(DIR => $scratch_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
210 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
211 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
212 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
213 $dir = tempdir();
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
214 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
215 mkdir($dir) if( ! -e $dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
216 $work_dir = $dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
217 return $dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
218 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
219
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
220
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
221 sub get_sclip_reads {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
222 my %args = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
223 my ($sam, $chr, $start, $end, $clip, $min_len, $validator, $paired, $rmdup, $full_seq, $extra_base) =
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
224 ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-CLIP},
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
225 $args{-MINCLIPLEN}, $args{-VALIDATOR}, $args{-PAIRED}, $args{-RMDUP},
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
226 $args{-FULLSEQ}, $args{-EXTRA});
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
227 $extra_base = 0 unless $extra_base;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
228 my @rtn;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
229 $min_len = 0 unless $min_len;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
230 my $max_sclip_len = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
231 my $range = $chr;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
232 $range = $chr . ":" . $start . "-" . $end if($start && $end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
233 my @pairs;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
234 my $strand_validator = $paired ? 1 : 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
235
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
236 $sam->fetch($range, sub {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
237 my $a = shift;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
238 my $cigar_str = $a->cigar_str;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
239 my @cigar_array = @{$a->cigar_array};
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
240 return if($cigar_str !~ m/S/);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
241 return unless($a->start && $a->end);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
242 #return if(!$a->proper_pair && $paired);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
243 if($paired && !$strand_validator) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
244 $validator->add_validator("strand_validator");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
245 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
246
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
247 if($paired && !$a->proper_pair) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
248 $validator->remove_validator("strand_validator");
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
249 $strand_validator = 0;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
250 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
251 my ($sclip_len, $pos, $seq, $qual);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
252 my @tmp = $a->qscore;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
253 if($cigar_array[0]->[0] eq 'S' && $clip == LEFT_CLIP ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
254 $pos = $a->start;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
255 return if($pos < $start or $pos > $end); # the softclipping position is not in range
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
256 $sclip_len = $cigar_array[0]->[1];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
257 #print $cigar_str, "\t", scalar @pairs, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
258 return unless($validator && $validator->validate($a, LEFT_CLIP));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
259 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
260 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
261 $seq = $full_seq ? $a->query->dna : substr($a->query->dna, 0, $sclip_len + $extra_base);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
262 @tmp = $full_seq ? @tmp : @tmp[0 .. ($sclip_len -1 + $extra_base)];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
263
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
264 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
265 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
266 my $qscore = $a->qscore;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
267
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
268 push @rtn, {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
269 name => $a->qname,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
270 seq => $seq,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
271 qual => \@tmp,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
272 full_seq => $a->query->dna,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
273 full_qual => $qscore,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
274 pos => $pos,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
275 };
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
276 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
277 if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
278 $pos = $a->end;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
279 return if($pos < $start or $pos > $end); # the softclipping position is not in range
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
280 # print $cigar_str, "\t", scalar @pairs, "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
281 return unless $validator->validate($a, RIGHT_CLIP);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
282 $sclip_len = $cigar_array[$#cigar_array]->[1];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
283 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
284 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
285 $seq = $full_seq ? $a->qseq : substr($a->qseq, $a->l_qseq - $sclip_len - $extra_base );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
286 @tmp = $full_seq ? @tmp : @tmp[($a->l_qseq - $sclip_len - $extra_base) .. ($a->l_qseq - 1)];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
287 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
288 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
289 my $qscore = $a->qscore;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
290 push @rtn, {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
291 name => $a->qname,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
292 seq => $seq,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
293 qual => \@tmp,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
294 full_seq => $a->query->dna,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
295 full_qual => $qscore,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
296 pos => $pos,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
297 };
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
298 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
299 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
300 );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
301 if($max_sclip_len >= $min_len) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
302 return @rtn;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
303 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
304 else{
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
305 undef @rtn; return @rtn;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
306 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
307 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
308
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
309 my $start_dir;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
310 BEGIN {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
311 $start_dir = getcwd;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
312 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
313 END {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
314 chdir($start_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
315 remove_tree($work_dir) if($use_scratch && $work_dir && $start_dir ne $work_dir);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
316 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
317
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
318 1;