Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
view 2.4/src/Merge_SV.pl @ 13:e3609c8714fb draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Fri, 30 May 2014 03:37:55 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w use Getopt::Long; use List::Util qw(min max); #Declare variables my ($window,$tmpSpace,$usage,$help,$outFile); GetOptions( 'v=s{2,}' => \@VCF, 'o:s' => \$outFile, 'w:s' => \$window, 'h|help' => \$help ); if((!@VCF)||($help)){&usage();exit} if (!$window) { $window=500; } if (!$outFile) { $outFile="merged.vcf.out"; } ########################################### # Protect against merging too many results ########################################### $tmpSpace='temporarySV_merge'; if (-e $tmpSpace) { #Delete temp file if it exists unlink $tmpSpace; } ########################################### #For each VCF, create a BEDPE file ########################################### open(OUT,">>$tmpSpace") or die "Can't write in this directory\n"; for (my $i=0;$i<@VCF;$i++){ #print STDERR "opening $VCF[$i]\n"; open(VCF,$VCF[$i]) or die &usage(); while (<VCF>) { next if ($_=~/^#/); chomp; @line=split("\t",$_); $mate=$line[4]; $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; @mate=split(/:/,$mate); $end1a=$line[1]-$window; $end1b=$line[1]+$window; $end2a=$mate[1]-$window; $end2b=$mate[1]+$window; next if (($end1a<0)||($end2a<0)); if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { next; } print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; } } close OUT; ########################################### #Now merge the BEDPE into a unique BEDPE ########################################### #Make sure the BEDPE is sorted #print "Make sure the BEDPE is sorted\n"; my $tmpSpace2=join("",$tmpSpace,".2"); system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2"); unlink($tmpSpace); #Create output files for the left and right merged BEDPE my $tmpSpace3=join("",$tmpSpace,".3"); my $tmpSpace4=join("",$tmpSpace,".4"); open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n"; open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n"; open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n"; #Initialize positions #my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4); my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4); while (<BEDPE>) { ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_); if(!$Echr1){ ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_); } while ( ($chr1 =~ /^$Echr1$/)&& ($pos2 <= $Epos2+$window)&& ($chr2 =~ /^$Echr2$/)&& ($pos3 <= $Epos3+$window) ) {$nextline = <BEDPE> ; last if (!$nextline); $nextline=~chomp; ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline); #print "NEXTLINE=$nextline"; push (@chr1,$chr1); push (@pos1,$pos1); push (@pos2,$pos2); push (@chr2,$chr2); push (@pos3,$pos3); push (@pos4,$pos4); } ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]); #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4); if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp} if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp} print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n"; print OUT2 join ("\t",$chr2,$pos3,$pos4); ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4); } close BEDPE; close OUT; unlink ($tmpSpace2); ##################################################################### #Now find out for each Unique BEDPE, how many Samples was the SV in? ##################################################################### #FOR EACH VCF #get NAME my $tmpSpace5=join("",$tmpSpace,".5"); my $tmpSpace6=join("",$tmpSpace,".6"); my $tmpSpace7=join("",$tmpSpace,".7"); my $tmpSpace8=join("",$tmpSpace,".8"); my $tmpSpace9=join("",$tmpSpace,".9"); #Create a placeholder file system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7"); #Convert the VCF into a BED PE for (my $i=0;$i<@VCF;$i++){ open (OUT,">$tmpSpace5") or die "Cant write in this directory\n"; open(VCF,$VCF[$i]) ; print STDERR "Starting on $VCF[$i]\n"; while (<VCF>) { next if ($_=~/^#/); chomp; @line=split("\t",$_); $mate=$line[4]; $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; @mate=split(/:/,$mate); $end1a=$line[1]-$window; $end1b=$line[1]+$window; $end2a=$mate[1]-$window; $end2b=$mate[1]+$window; next if (($end1a<0)||($end2a<0)); if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { #print "$_\n"; next; } print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; } close VCF; close OUT; #for each row in $tmpSpace3, count the number of overlaps on both sides my $left=join("",$tmpSpace,".left"); my $right=join("",$tmpSpace,".right"); system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left"); system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right"); my $Lcount=`wc -l $left|cut -f1 -d" "`; my $Rcount=`wc -l $right|cut -f1 -d" "`; if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"} system("paste $left $right > $tmpSpace5"); system ("rm $left $right"); open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n"; open (OUT,">$tmpSpace6") or die "Cant write in this directory\n"; while(<IN>){ $_=~chomp; @lines=split("\t",$_); if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"} } close IN; close OUT; system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8"); #system("head $tmpSpace7 $tmpSpace8"); open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n"; open (OUT,">$tmpSpace9") or die "Cant write in this directory\n"; my ($Samples,$NumSamples,$EVENT); while(<IN>){ $_=~chomp; @lines=split("\t",$_); if ($lines[8] > 0){ $Samples=$lines[7].";".$lines[9]; $Samples=~s/^NA;//; $NumSamples=$lines[6]+$lines[8]; } else{ $Samples=$lines[7]; $NumSamples=$lines[6]; } print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n"; } close IN; close OUT; print STDERR "completed with $VCF[$i]\n"; system("cp $tmpSpace9 $tmpSpace7"); } system("cp $tmpSpace7 $outFile"); unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6); print STDERR "Your results are in $outFile\n"; sub usage(){ print " ### ### This script will merge multiple SoftSearch VCF files ### Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file> Note: Must have bedtools installed and in your path\n\n\n"; }