Mercurial > repos > bzeitouni > svdetect
comparison svdetect/BAM_preprocessingPairs.pl @ 27:c284618dd8da draft
Circos 1.1.0
author | bzeitouni |
---|---|
date | Tue, 06 Nov 2012 10:06:26 -0500 |
parents | f090bf6ec765 |
children |
comparison
equal
deleted
inserted
replaced
26:76046ce1ff66 | 27:c284618dd8da |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Getopt::Std; | |
6 my $version = '0.4b_galaxy'; | |
7 | |
8 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; | |
9 | |
10 my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); | |
11 | |
12 getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY | |
13 | |
14 my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; | |
15 | |
16 my $pt_bad_mates_file=$opts{b}; #GALAXY | |
17 my $pt_log_file=$opts{l}; #GALAXY | |
18 my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY | |
19 | |
20 | |
21 die(qq/ | |
22 | |
23 Description: | |
24 | |
25 Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input | |
26 for SVDetect. | |
27 | |
28 From all pairs mapped onto the reference genome, this script outputs abnormal pairs: | |
29 - mapped on two different chromosomes | |
30 - with an incorrect strand orientation and\/or pair order | |
31 - with an insert size distance +- sigma threshold | |
32 into a file <prefix.ab.bam\/sam> sorted by read names | |
33 | |
34 -BAM\/SAM File input format only. | |
35 | |
36 Version : $version | |
37 SAMtools required for BAM files | |
38 | |
39 | |
40 Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam> | |
41 | |
42 Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] | |
43 -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] | |
44 -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] | |
45 -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] | |
46 -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] | |
47 -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] | |
48 -d dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional) | |
49 -o STRING output directory [$working_dir] | |
50 | |
51 \n/) if (@ARGV == 0 && -t STDIN); | |
52 | |
53 unless (-d $opts{o}){ | |
54 mkdir $opts{o} or die; | |
55 } | |
56 $opts{o}.="/" if($opts{o}!~/\/$/); | |
57 | |
58 my $mates_file=shift(@ARGV); | |
59 | |
60 $mates_file=readlink($mates_file); | |
61 | |
62 my $bad_mates_file=(split(/\//,$mates_file))[$#_]; | |
63 | |
64 if($bad_mates_file=~/.(s|b)am$/){ | |
65 $bad_mates_file=~s/.(b|s)am$/.ab.sam/; | |
66 $bad_mates_file=$opts{o}.$bad_mates_file; | |
67 } | |
68 | |
69 else{ | |
70 die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; | |
71 } | |
72 | |
73 my $good_mates_file; | |
74 if($opts{d}){ | |
75 $good_mates_file=(split(/\//,$mates_file))[$#_]; | |
76 $good_mates_file=~s/.(b|s)am$/.norm.sam/; | |
77 $good_mates_file=$opts{o}.$good_mates_file; | |
78 } | |
79 | |
80 my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY | |
81 | |
82 #------------------------------------------------------------------------------# | |
83 #Calculate mu and sigma | |
84 | |
85 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; | |
86 | |
87 print LOG "\# Calculating mu and sigma lengths...\n"; | |
88 print LOG "-- file=$mates_file\n"; | |
89 print LOG "-- n=$opts{n}\n"; | |
90 print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; | |
91 | |
92 my ($record, $sumX,$sumX2) = (0,0,0); | |
93 my $warn=$opts{n}/10; | |
94 my $prev_pair="FIRST"; | |
95 | |
96 my $bam=($mates_file =~ /.bam$/)? 1:0; | |
97 | |
98 if($bam){ | |
99 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
100 }else{ | |
101 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
102 } | |
103 | |
104 while(<MATES>){ | |
105 | |
106 my @t=split; | |
107 | |
108 next if ($t[0]=~/^@/); | |
109 | |
110 my $current_pair=$t[0]; | |
111 next if($current_pair eq $prev_pair); | |
112 $prev_pair=$current_pair; | |
113 | |
114 my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); | |
115 | |
116 next if ($chr1 eq "*" || $chr2 eq "*"); | |
117 next if ($length<$opts{s} || $length>$opts{S}) ; | |
118 | |
119 if($chr2 eq "="){ | |
120 | |
121 $sumX += $length; #add to sum and sum^2 for mean and variance calculation | |
122 $sumX2 += $length*$length; | |
123 $record++; | |
124 } | |
125 | |
126 if($record>$warn){ | |
127 print LOG "-- $warn pairs analysed\n"; | |
128 $warn+=$warn; | |
129 } | |
130 | |
131 last if ($record>$opts{n}); | |
132 | |
133 } | |
134 close (MATES); | |
135 | |
136 $record--; | |
137 my $mu = $sumX/$record; | |
138 my $sigma = sqrt($sumX2/$record - $mu*$mu); | |
139 | |
140 print LOG "-- Total : $record pairs analysed\n"; | |
141 print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; | |
142 | |
143 #------------------------------------------------------------------------------# | |
144 #------------------------------------------------------------------------------# | |
145 #Preprocessing pairs | |
146 | |
147 $warn=100000; | |
148 | |
149 $record=0; | |
150 my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); | |
151 | |
152 my $read_type=($opts{t})? "Illumina":"SOLiD"; | |
153 my $pair_type=($opts{p})? "paired-end":"mate-paired"; | |
154 | |
155 print LOG "\# Preprocessing pairs...\n"; | |
156 print LOG "-- file= $mates_file\n"; | |
157 print LOG "-- type= $read_type $pair_type reads\n"; | |
158 print LOG "-- sigma threshold= $opts{f}\n"; | |
159 print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; | |
160 | |
161 my @header; | |
162 | |
163 if($bam){ | |
164 open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
165 @header=<HEADER>; | |
166 close HEADER; | |
167 open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
168 }else{ | |
169 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
170 } | |
171 | |
172 open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; | |
173 print AB @header if($bam); | |
174 | |
175 if($opts{d}){ | |
176 open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; | |
177 print NORM @header if($bam); | |
178 } | |
179 | |
180 $prev_pair="FIRST"; | |
181 my $prev_bad; | |
182 | |
183 while(<MATES>){ | |
184 | |
185 my @t=split; | |
186 my $bad=0; | |
187 | |
188 if ($t[0]=~/^@/){ | |
189 print AB; | |
190 print NORM if ($opts{d}); | |
191 next; | |
192 } | |
193 | |
194 my $current_pair=$t[0]; | |
195 if($current_pair eq $prev_pair){ | |
196 next if($prev_bad==-1); | |
197 if($prev_bad){ | |
198 print AB; | |
199 }elsif(!$prev_bad){ | |
200 print NORM if($opts{d}); | |
201 } | |
202 next; | |
203 } | |
204 | |
205 $prev_pair=$current_pair; | |
206 | |
207 my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); | |
208 | |
209 if ($chr1 eq "*" || $chr2 eq "*"){ | |
210 $prev_bad=-1; | |
211 $count{unmap}++; | |
212 $record++; | |
213 next; | |
214 | |
215 } | |
216 | |
217 my $strand1 = (($t[1]&0x0010))? 'R':'F'; | |
218 my $strand2 = (($t[1]&0x0020))? 'R':'F'; | |
219 my $order1 = (($t[1]&0x0040))? '1':'2'; | |
220 my $order2 = (($t[1]&0x0080))? '1':'2'; | |
221 | |
222 if($order1 == 2){ | |
223 ($strand1,$strand2)=($strand2,$strand1); | |
224 ($chr1,$chr2)=($chr2,$chr1); | |
225 ($pos1,$pos2)=($pos2,$pos1); | |
226 ($order1,$order2)=($order2,$order1); | |
227 } | |
228 | |
229 my $sense=$strand1.$strand2; | |
230 | |
231 if($chr1 ne "=" && $chr2 ne "="){ | |
232 $bad=1; | |
233 $count{chr}++; | |
234 } | |
235 | |
236 if($opts{p}){ #paired-end | |
237 if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ | |
238 $bad=1; | |
239 $count{sense}++; | |
240 } | |
241 }else{ #mate-pair | |
242 if($opts{t}){ #Illumina | |
243 if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ | |
244 $bad=1; | |
245 $count{sense}++; | |
246 } | |
247 }else{ #SOLiD | |
248 if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ | |
249 $bad=1; | |
250 $count{sense}++; | |
251 } | |
252 } | |
253 } | |
254 | |
255 if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ | |
256 $bad=1; | |
257 $count{dist}++; | |
258 } | |
259 | |
260 if($bad){ | |
261 print AB; | |
262 $count{ab}++; | |
263 $prev_bad=$bad; | |
264 }else{ | |
265 print NORM if ($opts{d}); | |
266 $count{norm}++; | |
267 $prev_bad=$bad; | |
268 } | |
269 | |
270 $record++; | |
271 | |
272 if($record>$warn){ | |
273 print LOG "-- $warn pairs analysed\n"; | |
274 $warn+=100000; | |
275 } | |
276 } | |
277 | |
278 close AB; | |
279 close NORM if($opts{d}); | |
280 | |
281 print LOG "-- Total : $record pairs analysed\n"; | |
282 print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; | |
283 print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; | |
284 print LOG "---- $count{ab} abnormal mapped pairs\n"; | |
285 print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; | |
286 print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; | |
287 print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; | |
288 print LOG "--- $count{norm} correct mapped pairs\n"; | |
289 | |
290 #------------------------------------------------------------------------------# | |
291 #------------------------------------------------------------------------------# | |
292 #OUTPUT | |
293 | |
294 if($bam){ | |
295 | |
296 my $bam_file=$bad_mates_file; | |
297 $bam_file=~s/.sam$/.bam/; | |
298 print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; | |
299 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | |
300 unlink($bad_mates_file); | |
301 print LOG "-- output created: $bam_file\n"; | |
302 | |
303 system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY | |
304 | |
305 if($opts{d}){ | |
306 $bam_file=$good_mates_file; | |
307 $bam_file=~s/.sam$/.bam/; | |
308 print LOG "\# Converting sam to bam for correct mapped pairs\n"; | |
309 system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); | |
310 unlink($good_mates_file); | |
311 print LOG "-- output created: $bam_file\n"; | |
312 | |
313 system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY | |
314 | |
315 } | |
316 | |
317 } | |
318 | |
319 else{ | |
320 print LOG "-- output created: $bad_mates_file\n"; | |
321 print LOG "-- output created: $good_mates_file\n" if($opts{d}); | |
322 } | |
323 | |
324 close LOG; | |
325 | |
326 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | |
327 | |
328 | |
329 #------------------------------------------------------------------------------# | |
330 #------------------------------------------------------------------------------# | |
331 sub decimal{ | |
332 | |
333 my $num=shift; | |
334 my $digs_to_cut=shift; | |
335 | |
336 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | |
337 | |
338 return $num; | |
339 } | |
340 #------------------------------------------------------------------------------# |