# HG changeset patch # User bzeitouni # Date 1339432218 14400 # Node ID 861783bb65d2cb1dc0e577bb58c79e347ae5a0e3 # Parent 89b207100214a3dddc929aca435e4c652adffea7 Uploaded diff -r 89b207100214 -r 861783bb65d2 BAM_preprocessingPairs.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BAM_preprocessingPairs.pl Mon Jun 11 12:30:18 2012 -0400 @@ -0,0 +1,340 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; +my $version = '0.4b_galaxy'; + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; + +my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); + +getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY + +my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; + +my $pt_bad_mates_file=$opts{b}; #GALAXY +my $pt_log_file=$opts{l}; #GALAXY +my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY + + +die(qq/ + +Description: + + Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input + for SVDetect. + + From all pairs mapped onto the reference genome, this script outputs abnormal pairs: + - mapped on two different chromosomes + - with an incorrect strand orientation and\/or pair order + - with an insert size distance +- sigma threshold + into a file sorted by read names + + -BAM\/SAM File input format only. + + Version : $version + SAMtools required for BAM files + + +Usage: BAM_preprocessingPairs.pl [options] + +Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] + -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] + -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] + -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] + -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] + -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] + -d dump normal pairs into a file [] (optional) + -o STRING output directory [$working_dir] + +\n/) if (@ARGV == 0 && -t STDIN); + +unless (-d $opts{o}){ + mkdir $opts{o} or die; +} +$opts{o}.="/" if($opts{o}!~/\/$/); + +my $mates_file=shift(@ARGV); + +$mates_file=readlink($mates_file); + +my $bad_mates_file=(split(/\//,$mates_file))[$#_]; + +if($bad_mates_file=~/.(s|b)am$/){ + $bad_mates_file=~s/.(b|s)am$/.ab.sam/; + $bad_mates_file=$opts{o}.$bad_mates_file; +} + +else{ + die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; +} + +my $good_mates_file; +if($opts{d}){ + $good_mates_file=(split(/\//,$mates_file))[$#_]; + $good_mates_file=~s/.(b|s)am$/.norm.sam/; + $good_mates_file=$opts{o}.$good_mates_file; +} + +my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY + +#------------------------------------------------------------------------------# +#Calculate mu and sigma + +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; + +print LOG "\# Calculating mu and sigma lengths...\n"; +print LOG "-- file=$mates_file\n"; +print LOG "-- n=$opts{n}\n"; +print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; + +my ($record, $sumX,$sumX2) = (0,0,0); +my $warn=$opts{n}/10; +my $prev_pair="FIRST"; + +my $bam=($mates_file =~ /.bam$/)? 1:0; + +if($bam){ + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +while(){ + + my @t=split; + + next if ($t[0]=~/^@/); + + my $current_pair=$t[0]; + next if($current_pair eq $prev_pair); + $prev_pair=$current_pair; + + my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); + + next if ($chr1 eq "*" || $chr2 eq "*"); + next if ($length<$opts{s} || $length>$opts{S}) ; + + if($chr2 eq "="){ + + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + $record++; + } + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=$warn; + } + + last if ($record>$opts{n}); + +} +close (MATES); + +$record--; +my $mu = $sumX/$record; +my $sigma = sqrt($sumX2/$record - $mu*$mu); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Preprocessing pairs + +$warn=100000; + +$record=0; +my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); + +my $read_type=($opts{t})? "Illumina":"SOLiD"; +my $pair_type=($opts{p})? "paired-end":"mate-paired"; + +print LOG "\# Preprocessing pairs...\n"; +print LOG "-- file= $mates_file\n"; +print LOG "-- type= $read_type $pair_type reads\n"; +print LOG "-- sigma threshold= $opts{f}\n"; +print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; + +my @header; + +if($bam){ + open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + @header=
; + close HEADER; + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; +print AB @header if($bam); + +if($opts{d}){ + open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; + print NORM @header if($bam); +} + +$prev_pair="FIRST"; +my $prev_bad; + +while(){ + + my @t=split; + my $bad=0; + + if ($t[0]=~/^@/){ + print AB; + print NORM if ($opts{d}); + next; + } + + my $current_pair=$t[0]; + if($current_pair eq $prev_pair){ + next if($prev_bad==-1); + if($prev_bad){ + print AB; + }elsif(!$prev_bad){ + print NORM if($opts{d}); + } + next; + } + + $prev_pair=$current_pair; + + my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); + + if ($chr1 eq "*" || $chr2 eq "*"){ + $prev_bad=-1; + $count{unmap}++; + $record++; + next; + + } + + my $strand1 = (($t[1]&0x0010))? 'R':'F'; + my $strand2 = (($t[1]&0x0020))? 'R':'F'; + my $order1 = (($t[1]&0x0040))? '1':'2'; + my $order2 = (($t[1]&0x0080))? '1':'2'; + + if($order1 == 2){ + ($strand1,$strand2)=($strand2,$strand1); + ($chr1,$chr2)=($chr2,$chr1); + ($pos1,$pos2)=($pos2,$pos1); + ($order1,$order2)=($order2,$order1); + } + + my $sense=$strand1.$strand2; + + if($chr1 ne "=" && $chr2 ne "="){ + $bad=1; + $count{chr}++; + } + + if($opts{p}){ #paired-end + if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ + $bad=1; + $count{sense}++; + } + }else{ #mate-pair + if($opts{t}){ #Illumina + if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + }else{ #SOLiD + if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + } + } + + if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ + $bad=1; + $count{dist}++; + } + + if($bad){ + print AB; + $count{ab}++; + $prev_bad=$bad; + }else{ + print NORM if ($opts{d}); + $count{norm}++; + $prev_bad=$bad; + } + + $record++; + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=100000; + } +} + +close AB; +close NORM if($opts{d}); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; +print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; +print LOG "---- $count{ab} abnormal mapped pairs\n"; +print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; +print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; +print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; +print LOG "--- $count{norm} correct mapped pairs\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#OUTPUT + +if($bam){ + + my $bam_file=$bad_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($bad_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY + + if($opts{d}){ + $bam_file=$good_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for correct mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($good_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY + + } + +} + +else{ + print LOG "-- output created: $bad_mates_file\n"; + print LOG "-- output created: $good_mates_file\n" if($opts{d}); +} + +close LOG; + +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} +#------------------------------------------------------------------------------#