comparison 2.4/src/Merge_SV.pl @ 13:e3609c8714fb draft

Uploaded
author plus91-technologies-pvt-ltd
date Fri, 30 May 2014 03:37:55 -0400
parents
children
comparison
equal deleted inserted replaced
12:980fce54e60a 13:e3609c8714fb
1 #!/usr/bin/perl -w
2 use Getopt::Long;
3 use List::Util qw(min max);
4
5
6 #Declare variables
7 my ($window,$tmpSpace,$usage,$help,$outFile);
8
9 GetOptions(
10 'v=s{2,}' => \@VCF,
11 'o:s' => \$outFile,
12 'w:s' => \$window,
13 'h|help' => \$help
14 );
15
16 if((!@VCF)||($help)){&usage();exit}
17
18
19 if (!$window) {
20 $window=500;
21 }
22 if (!$outFile) {
23 $outFile="merged.vcf.out";
24 }
25 ###########################################
26 # Protect against merging too many results
27 ###########################################
28 $tmpSpace='temporarySV_merge';
29 if (-e $tmpSpace) {
30 #Delete temp file if it exists
31 unlink $tmpSpace;
32 }
33 ###########################################
34 #For each VCF, create a BEDPE file
35 ###########################################
36
37 open(OUT,">>$tmpSpace") or die "Can't write in this directory\n";
38 for (my $i=0;$i<@VCF;$i++){
39 #print STDERR "opening $VCF[$i]\n";
40 open(VCF,$VCF[$i]) or die &usage();
41 while (<VCF>) {
42 next if ($_=~/^#/);
43 chomp;
44 @line=split("\t",$_);
45 $mate=$line[4];
46 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
47 @mate=split(/:/,$mate);
48 $end1a=$line[1]-$window;
49 $end1b=$line[1]+$window;
50 $end2a=$mate[1]-$window;
51 $end2b=$mate[1]+$window;
52 next if (($end1a<0)||($end2a<0));
53 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
54 next;
55 }
56 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
57 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
58 }
59 }
60 close OUT;
61
62 ###########################################
63 #Now merge the BEDPE into a unique BEDPE
64 ###########################################
65 #Make sure the BEDPE is sorted
66 #print "Make sure the BEDPE is sorted\n";
67 my $tmpSpace2=join("",$tmpSpace,".2");
68 system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2");
69 unlink($tmpSpace);
70
71 #Create output files for the left and right merged BEDPE
72 my $tmpSpace3=join("",$tmpSpace,".3");
73 my $tmpSpace4=join("",$tmpSpace,".4");
74
75 open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n";
76 open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n";
77
78 open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n";
79 #Initialize positions
80 #my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4);
81 my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4);
82 while (<BEDPE>) {
83 ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_);
84 if(!$Echr1){
85 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_);
86 }
87 while (
88 ($chr1 =~ /^$Echr1$/)&&
89 ($pos2 <= $Epos2+$window)&&
90 ($chr2 =~ /^$Echr2$/)&&
91 ($pos3 <= $Epos3+$window)
92 )
93 {$nextline = <BEDPE> ;
94 last if (!$nextline);
95 $nextline=~chomp;
96 ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline);
97 #print "NEXTLINE=$nextline";
98 push (@chr1,$chr1);
99 push (@pos1,$pos1);
100 push (@pos2,$pos2);
101 push (@chr2,$chr2);
102 push (@pos3,$pos3);
103 push (@pos4,$pos4);
104 }
105 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]);
106 #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4);
107 if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp}
108 if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp}
109 print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n";
110 print OUT2 join ("\t",$chr2,$pos3,$pos4);
111 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4);
112 }
113 close BEDPE;
114 close OUT;
115 unlink ($tmpSpace2);
116
117 #####################################################################
118 #Now find out for each Unique BEDPE, how many Samples was the SV in?
119 #####################################################################
120 #FOR EACH VCF
121 #get NAME
122
123 my $tmpSpace5=join("",$tmpSpace,".5");
124 my $tmpSpace6=join("",$tmpSpace,".6");
125 my $tmpSpace7=join("",$tmpSpace,".7");
126 my $tmpSpace8=join("",$tmpSpace,".8");
127 my $tmpSpace9=join("",$tmpSpace,".9");
128
129 #Create a placeholder file
130 system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7");
131 #Convert the VCF into a BED PE
132 for (my $i=0;$i<@VCF;$i++){
133 open (OUT,">$tmpSpace5") or die "Cant write in this directory\n";
134 open(VCF,$VCF[$i]) ;
135 print STDERR "Starting on $VCF[$i]\n";
136 while (<VCF>) {
137 next if ($_=~/^#/);
138 chomp;
139 @line=split("\t",$_);
140 $mate=$line[4];
141 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
142 @mate=split(/:/,$mate);
143 $end1a=$line[1]-$window;
144 $end1b=$line[1]+$window;
145 $end2a=$mate[1]-$window;
146 $end2b=$mate[1]+$window;
147 next if (($end1a<0)||($end2a<0));
148 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
149 #print "$_\n";
150 next;
151 }
152 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
153 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
154 }
155 close VCF;
156 close OUT;
157 #for each row in $tmpSpace3, count the number of overlaps on both sides
158 my $left=join("",$tmpSpace,".left");
159 my $right=join("",$tmpSpace,".right");
160 system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left");
161 system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right");
162
163 my $Lcount=`wc -l $left|cut -f1 -d" "`;
164 my $Rcount=`wc -l $right|cut -f1 -d" "`;
165 if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"}
166 system("paste $left $right > $tmpSpace5");
167 system ("rm $left $right");
168 open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n";
169 open (OUT,">$tmpSpace6") or die "Cant write in this directory\n";
170 while(<IN>){
171 $_=~chomp;
172 @lines=split("\t",$_);
173 if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"}
174 }
175 close IN;
176 close OUT;
177
178 system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8");
179 #system("head $tmpSpace7 $tmpSpace8");
180 open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n";
181 open (OUT,">$tmpSpace9") or die "Cant write in this directory\n";
182 my ($Samples,$NumSamples,$EVENT);
183 while(<IN>){
184 $_=~chomp;
185 @lines=split("\t",$_);
186
187 if ($lines[8] > 0){
188 $Samples=$lines[7].";".$lines[9];
189 $Samples=~s/^NA;//;
190 $NumSamples=$lines[6]+$lines[8];
191 }
192 else{
193 $Samples=$lines[7];
194 $NumSamples=$lines[6];
195 }
196 print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n";
197 }
198 close IN;
199 close OUT;
200 print STDERR "completed with $VCF[$i]\n";
201 system("cp $tmpSpace9 $tmpSpace7");
202 }
203
204 system("cp $tmpSpace7 $outFile");
205 unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6);
206 print STDERR "Your results are in $outFile\n";
207
208
209 sub usage(){
210 print "
211 ###
212 ### This script will merge multiple SoftSearch VCF files
213 ###
214
215 Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file>
216
217 Note: Must have bedtools installed and in your path\n\n\n";
218 }