diff 2.4/src/Merge_SV.pl @ 16:8eb7d93f7e58 draft

Uploaded
author plus91-technologies-pvt-ltd
date Sat, 31 May 2014 11:23:36 -0400
parents e3609c8714fb
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/2.4/src/Merge_SV.pl	Sat May 31 11:23:36 2014 -0400
@@ -0,0 +1,218 @@
+#!/usr/bin/perl -w
+use Getopt::Long;
+use List::Util qw(min max);
+
+
+#Declare variables
+my ($window,$tmpSpace,$usage,$help,$outFile);
+
+GetOptions(
+        'v=s{2,}' => \@VCF,
+        'o:s' => \$outFile,
+        'w:s' => \$window,
+		'h|help' => \$help
+);
+
+if((!@VCF)||($help)){&usage();exit}
+
+
+if (!$window) {
+    $window=500;
+}
+if (!$outFile) {
+    $outFile="merged.vcf.out";
+}
+###########################################
+# Protect against merging too many results
+###########################################
+$tmpSpace='temporarySV_merge';
+if (-e $tmpSpace) {
+    #Delete temp file if it exists
+    unlink $tmpSpace;
+}
+###########################################
+#For each VCF, create a BEDPE file
+###########################################
+
+open(OUT,">>$tmpSpace") or die "Can't write in this directory\n";
+for (my $i=0;$i<@VCF;$i++){
+    #print STDERR "opening $VCF[$i]\n";
+    open(VCF,$VCF[$i]) or die &usage();
+    while (<VCF>) {
+        next if ($_=~/^#/);
+        chomp;
+        @line=split("\t",$_);
+        $mate=$line[4];
+        $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
+        @mate=split(/:/,$mate);
+        $end1a=$line[1]-$window;
+        $end1b=$line[1]+$window;
+        $end2a=$mate[1]-$window;
+        $end2b=$mate[1]+$window;
+        next if (($end1a<0)||($end2a<0));
+        if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
+            next;
+        }
+        print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
+        print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
+    }
+}
+close OUT;
+
+###########################################
+#Now merge the BEDPE into a unique BEDPE
+###########################################
+#Make sure the BEDPE is sorted
+#print "Make sure the BEDPE is sorted\n";
+my $tmpSpace2=join("",$tmpSpace,".2");
+system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2");
+unlink($tmpSpace);
+
+#Create output files for the left and right merged BEDPE
+my $tmpSpace3=join("",$tmpSpace,".3");
+my $tmpSpace4=join("",$tmpSpace,".4");
+
+open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n";
+open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n";
+
+open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n";
+#Initialize positions
+#my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4);
+my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4);
+while (<BEDPE>) {
+    ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_);
+	if(!$Echr1){
+	($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_);
+	}
+    while ( 
+		 ($chr1 =~ /^$Echr1$/)&&
+           ($pos2 <= $Epos2+$window)&&
+            ($chr2 =~ /^$Echr2$/)&&
+           ($pos3 <= $Epos3+$window)
+          )
+        {$nextline = <BEDPE> ;
+		last if (!$nextline);
+		$nextline=~chomp;
+         ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline);
+		 #print "NEXTLINE=$nextline";
+         push (@chr1,$chr1);
+         push (@pos1,$pos1);
+         push (@pos2,$pos2);
+         push (@chr2,$chr2);
+         push (@pos3,$pos3);
+         push (@pos4,$pos4);   
+		  }
+    ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]);
+    #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4);
+	if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp}
+	if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp}
+	print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n";
+	print OUT2 join ("\t",$chr2,$pos3,$pos4);
+	($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4);
+	}
+close BEDPE;
+close OUT;
+unlink ($tmpSpace2);
+
+#####################################################################
+#Now find out for each Unique BEDPE, how many Samples was the SV in?
+#####################################################################
+#FOR EACH VCF
+#get NAME
+
+my $tmpSpace5=join("",$tmpSpace,".5");
+my $tmpSpace6=join("",$tmpSpace,".6");
+my $tmpSpace7=join("",$tmpSpace,".7");
+my $tmpSpace8=join("",$tmpSpace,".8");
+my $tmpSpace9=join("",$tmpSpace,".9");
+
+#Create a placeholder file
+system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7");
+#Convert the VCF into a BED PE
+for (my $i=0;$i<@VCF;$i++){
+	open (OUT,">$tmpSpace5") or die "Cant write in this directory\n";
+	open(VCF,$VCF[$i]) ;
+	print STDERR "Starting on $VCF[$i]\n";
+		while (<VCF>) {
+			next if ($_=~/^#/);
+			chomp;
+			@line=split("\t",$_);
+			$mate=$line[4];
+			$mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
+			@mate=split(/:/,$mate);
+			$end1a=$line[1]-$window;
+			$end1b=$line[1]+$window;
+			$end2a=$mate[1]-$window;
+			$end2b=$mate[1]+$window;
+			next if (($end1a<0)||($end2a<0));
+			if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
+				#print "$_\n";
+				next;
+			}
+			print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
+			print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
+		}
+	close VCF;
+	close OUT;
+	#for each row in $tmpSpace3, count the number of overlaps on both sides
+	my $left=join("",$tmpSpace,".left");
+	my $right=join("",$tmpSpace,".right");
+	system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left");
+	system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right");
+
+	my $Lcount=`wc -l $left|cut -f1 -d" "`;
+	my $Rcount=`wc -l $right|cut -f1 -d" "`;
+	if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"}
+	system("paste $left $right > $tmpSpace5");
+	system ("rm $left $right");
+	open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n";
+	open (OUT,">$tmpSpace6") or die "Cant write in this directory\n";
+	while(<IN>){
+		$_=~chomp;
+		@lines=split("\t",$_);
+		if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"}
+		}
+	close IN;
+	close OUT;
+
+	system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8");
+	#system("head $tmpSpace7 $tmpSpace8");
+	 open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n";
+	 open (OUT,">$tmpSpace9") or die "Cant write in this directory\n";
+	 my ($Samples,$NumSamples,$EVENT);
+	 while(<IN>){
+		 $_=~chomp;
+		 @lines=split("\t",$_);
+
+		 if ($lines[8] > 0){
+			$Samples=$lines[7].";".$lines[9];
+			$Samples=~s/^NA;//;
+			$NumSamples=$lines[6]+$lines[8];
+			}
+			else{
+			$Samples=$lines[7];
+			$NumSamples=$lines[6];
+			}
+			print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n";
+	 }
+	 close IN;
+	 close OUT;
+	 print STDERR "completed with $VCF[$i]\n";
+	 system("cp $tmpSpace9 $tmpSpace7");
+}
+
+system("cp $tmpSpace7 $outFile");
+unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6);
+print STDERR "Your results are in $outFile\n";
+
+
+sub usage(){
+    print "
+###
+### This script will merge multiple SoftSearch VCF files
+###
+
+Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file>
+   
+    Note: Must have bedtools installed and in your path\n\n\n";
+}