Mercurial > repos > bzeitouni > svdetect
view BAM_preprocessingPairs.pl @ 7:c8744c56e979 draft
Uploaded
author | bzeitouni |
---|---|
date | Mon, 11 Jun 2012 12:31:50 -0400 |
parents | 861783bb65d2 |
children |
line wrap: on
line source
#!/usr/bin/perl -w use strict; use warnings; use Getopt::Std; my $version = '0.4b_galaxy'; my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; my $pt_bad_mates_file=$opts{b}; #GALAXY my $pt_log_file=$opts{l}; #GALAXY my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY die(qq/ Description: Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input for SVDetect. From all pairs mapped onto the reference genome, this script outputs abnormal pairs: - mapped on two different chromosomes - with an incorrect strand orientation and\/or pair order - with an insert size distance +- sigma threshold into a file <prefix.ab.bam\/sam> sorted by read names -BAM\/SAM File input format only. Version : $version SAMtools required for BAM files Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam> Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] -d dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional) -o STRING output directory [$working_dir] \n/) if (@ARGV == 0 && -t STDIN); unless (-d $opts{o}){ mkdir $opts{o} or die; } $opts{o}.="/" if($opts{o}!~/\/$/); my $mates_file=shift(@ARGV); $mates_file=readlink($mates_file); my $bad_mates_file=(split(/\//,$mates_file))[$#_]; if($bad_mates_file=~/.(s|b)am$/){ $bad_mates_file=~s/.(b|s)am$/.ab.sam/; $bad_mates_file=$opts{o}.$bad_mates_file; } else{ die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; } my $good_mates_file; if($opts{d}){ $good_mates_file=(split(/\//,$mates_file))[$#_]; $good_mates_file=~s/.(b|s)am$/.norm.sam/; $good_mates_file=$opts{o}.$good_mates_file; } my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY #------------------------------------------------------------------------------# #Calculate mu and sigma open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; print LOG "\# Calculating mu and sigma lengths...\n"; print LOG "-- file=$mates_file\n"; print LOG "-- n=$opts{n}\n"; print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; my ($record, $sumX,$sumX2) = (0,0,0); my $warn=$opts{n}/10; my $prev_pair="FIRST"; my $bam=($mates_file =~ /.bam$/)? 1:0; if($bam){ open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; }else{ open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; } while(<MATES>){ my @t=split; next if ($t[0]=~/^@/); my $current_pair=$t[0]; next if($current_pair eq $prev_pair); $prev_pair=$current_pair; my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); next if ($chr1 eq "*" || $chr2 eq "*"); next if ($length<$opts{s} || $length>$opts{S}) ; if($chr2 eq "="){ $sumX += $length; #add to sum and sum^2 for mean and variance calculation $sumX2 += $length*$length; $record++; } if($record>$warn){ print LOG "-- $warn pairs analysed\n"; $warn+=$warn; } last if ($record>$opts{n}); } close (MATES); $record--; my $mu = $sumX/$record; my $sigma = sqrt($sumX2/$record - $mu*$mu); print LOG "-- Total : $record pairs analysed\n"; print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; #------------------------------------------------------------------------------# #------------------------------------------------------------------------------# #Preprocessing pairs $warn=100000; $record=0; my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); my $read_type=($opts{t})? "Illumina":"SOLiD"; my $pair_type=($opts{p})? "paired-end":"mate-paired"; print LOG "\# Preprocessing pairs...\n"; print LOG "-- file= $mates_file\n"; print LOG "-- type= $read_type $pair_type reads\n"; print LOG "-- sigma threshold= $opts{f}\n"; print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; my @header; if($bam){ open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; @header=<HEADER>; close HEADER; open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; }else{ open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; } open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; print AB @header if($bam); if($opts{d}){ open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; print NORM @header if($bam); } $prev_pair="FIRST"; my $prev_bad; while(<MATES>){ my @t=split; my $bad=0; if ($t[0]=~/^@/){ print AB; print NORM if ($opts{d}); next; } my $current_pair=$t[0]; if($current_pair eq $prev_pair){ next if($prev_bad==-1); if($prev_bad){ print AB; }elsif(!$prev_bad){ print NORM if($opts{d}); } next; } $prev_pair=$current_pair; my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); if ($chr1 eq "*" || $chr2 eq "*"){ $prev_bad=-1; $count{unmap}++; $record++; next; } my $strand1 = (($t[1]&0x0010))? 'R':'F'; my $strand2 = (($t[1]&0x0020))? 'R':'F'; my $order1 = (($t[1]&0x0040))? '1':'2'; my $order2 = (($t[1]&0x0080))? '1':'2'; if($order1 == 2){ ($strand1,$strand2)=($strand2,$strand1); ($chr1,$chr2)=($chr2,$chr1); ($pos1,$pos2)=($pos2,$pos1); ($order1,$order2)=($order2,$order1); } my $sense=$strand1.$strand2; if($chr1 ne "=" && $chr2 ne "="){ $bad=1; $count{chr}++; } if($opts{p}){ #paired-end if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ $bad=1; $count{sense}++; } }else{ #mate-pair if($opts{t}){ #Illumina if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ $bad=1; $count{sense}++; } }else{ #SOLiD if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ $bad=1; $count{sense}++; } } } if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ $bad=1; $count{dist}++; } if($bad){ print AB; $count{ab}++; $prev_bad=$bad; }else{ print NORM if ($opts{d}); $count{norm}++; $prev_bad=$bad; } $record++; if($record>$warn){ print LOG "-- $warn pairs analysed\n"; $warn+=100000; } } close AB; close NORM if($opts{d}); print LOG "-- Total : $record pairs analysed\n"; print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; print LOG "---- $count{ab} abnormal mapped pairs\n"; print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; print LOG "--- $count{norm} correct mapped pairs\n"; #------------------------------------------------------------------------------# #------------------------------------------------------------------------------# #OUTPUT if($bam){ my $bam_file=$bad_mates_file; $bam_file=~s/.sam$/.bam/; print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); unlink($bad_mates_file); print LOG "-- output created: $bam_file\n"; system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY if($opts{d}){ $bam_file=$good_mates_file; $bam_file=~s/.sam$/.bam/; print LOG "\# Converting sam to bam for correct mapped pairs\n"; system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); unlink($good_mates_file); print LOG "-- output created: $bam_file\n"; system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY } } else{ print LOG "-- output created: $bad_mates_file\n"; print LOG "-- output created: $good_mates_file\n" if($opts{d}); } close LOG; system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY #------------------------------------------------------------------------------# #------------------------------------------------------------------------------# sub decimal{ my $num=shift; my $digs_to_cut=shift; $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); return $num; } #------------------------------------------------------------------------------#