13
|
1 #!/usr/bin/perl -w
|
|
2 use Getopt::Long;
|
|
3 use List::Util qw(min max);
|
|
4
|
|
5
|
|
6 #Declare variables
|
|
7 my ($window,$tmpSpace,$usage,$help,$outFile);
|
|
8
|
|
9 GetOptions(
|
|
10 'v=s{2,}' => \@VCF,
|
|
11 'o:s' => \$outFile,
|
|
12 'w:s' => \$window,
|
|
13 'h|help' => \$help
|
|
14 );
|
|
15
|
|
16 if((!@VCF)||($help)){&usage();exit}
|
|
17
|
|
18
|
|
19 if (!$window) {
|
|
20 $window=500;
|
|
21 }
|
|
22 if (!$outFile) {
|
|
23 $outFile="merged.vcf.out";
|
|
24 }
|
|
25 ###########################################
|
|
26 # Protect against merging too many results
|
|
27 ###########################################
|
|
28 $tmpSpace='temporarySV_merge';
|
|
29 if (-e $tmpSpace) {
|
|
30 #Delete temp file if it exists
|
|
31 unlink $tmpSpace;
|
|
32 }
|
|
33 ###########################################
|
|
34 #For each VCF, create a BEDPE file
|
|
35 ###########################################
|
|
36
|
|
37 open(OUT,">>$tmpSpace") or die "Can't write in this directory\n";
|
|
38 for (my $i=0;$i<@VCF;$i++){
|
|
39 #print STDERR "opening $VCF[$i]\n";
|
|
40 open(VCF,$VCF[$i]) or die &usage();
|
|
41 while (<VCF>) {
|
|
42 next if ($_=~/^#/);
|
|
43 chomp;
|
|
44 @line=split("\t",$_);
|
|
45 $mate=$line[4];
|
|
46 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
|
|
47 @mate=split(/:/,$mate);
|
|
48 $end1a=$line[1]-$window;
|
|
49 $end1b=$line[1]+$window;
|
|
50 $end2a=$mate[1]-$window;
|
|
51 $end2b=$mate[1]+$window;
|
|
52 next if (($end1a<0)||($end2a<0));
|
|
53 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
|
|
54 next;
|
|
55 }
|
|
56 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
|
|
57 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
|
|
58 }
|
|
59 }
|
|
60 close OUT;
|
|
61
|
|
62 ###########################################
|
|
63 #Now merge the BEDPE into a unique BEDPE
|
|
64 ###########################################
|
|
65 #Make sure the BEDPE is sorted
|
|
66 #print "Make sure the BEDPE is sorted\n";
|
|
67 my $tmpSpace2=join("",$tmpSpace,".2");
|
|
68 system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2");
|
|
69 unlink($tmpSpace);
|
|
70
|
|
71 #Create output files for the left and right merged BEDPE
|
|
72 my $tmpSpace3=join("",$tmpSpace,".3");
|
|
73 my $tmpSpace4=join("",$tmpSpace,".4");
|
|
74
|
|
75 open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n";
|
|
76 open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n";
|
|
77
|
|
78 open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n";
|
|
79 #Initialize positions
|
|
80 #my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4);
|
|
81 my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4);
|
|
82 while (<BEDPE>) {
|
|
83 ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_);
|
|
84 if(!$Echr1){
|
|
85 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_);
|
|
86 }
|
|
87 while (
|
|
88 ($chr1 =~ /^$Echr1$/)&&
|
|
89 ($pos2 <= $Epos2+$window)&&
|
|
90 ($chr2 =~ /^$Echr2$/)&&
|
|
91 ($pos3 <= $Epos3+$window)
|
|
92 )
|
|
93 {$nextline = <BEDPE> ;
|
|
94 last if (!$nextline);
|
|
95 $nextline=~chomp;
|
|
96 ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline);
|
|
97 #print "NEXTLINE=$nextline";
|
|
98 push (@chr1,$chr1);
|
|
99 push (@pos1,$pos1);
|
|
100 push (@pos2,$pos2);
|
|
101 push (@chr2,$chr2);
|
|
102 push (@pos3,$pos3);
|
|
103 push (@pos4,$pos4);
|
|
104 }
|
|
105 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]);
|
|
106 #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4);
|
|
107 if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp}
|
|
108 if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp}
|
|
109 print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n";
|
|
110 print OUT2 join ("\t",$chr2,$pos3,$pos4);
|
|
111 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4);
|
|
112 }
|
|
113 close BEDPE;
|
|
114 close OUT;
|
|
115 unlink ($tmpSpace2);
|
|
116
|
|
117 #####################################################################
|
|
118 #Now find out for each Unique BEDPE, how many Samples was the SV in?
|
|
119 #####################################################################
|
|
120 #FOR EACH VCF
|
|
121 #get NAME
|
|
122
|
|
123 my $tmpSpace5=join("",$tmpSpace,".5");
|
|
124 my $tmpSpace6=join("",$tmpSpace,".6");
|
|
125 my $tmpSpace7=join("",$tmpSpace,".7");
|
|
126 my $tmpSpace8=join("",$tmpSpace,".8");
|
|
127 my $tmpSpace9=join("",$tmpSpace,".9");
|
|
128
|
|
129 #Create a placeholder file
|
|
130 system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7");
|
|
131 #Convert the VCF into a BED PE
|
|
132 for (my $i=0;$i<@VCF;$i++){
|
|
133 open (OUT,">$tmpSpace5") or die "Cant write in this directory\n";
|
|
134 open(VCF,$VCF[$i]) ;
|
|
135 print STDERR "Starting on $VCF[$i]\n";
|
|
136 while (<VCF>) {
|
|
137 next if ($_=~/^#/);
|
|
138 chomp;
|
|
139 @line=split("\t",$_);
|
|
140 $mate=$line[4];
|
|
141 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g;
|
|
142 @mate=split(/:/,$mate);
|
|
143 $end1a=$line[1]-$window;
|
|
144 $end1b=$line[1]+$window;
|
|
145 $end2a=$mate[1]-$window;
|
|
146 $end2b=$mate[1]+$window;
|
|
147 next if (($end1a<0)||($end2a<0));
|
|
148 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) {
|
|
149 #print "$_\n";
|
|
150 next;
|
|
151 }
|
|
152 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n";
|
|
153 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n";
|
|
154 }
|
|
155 close VCF;
|
|
156 close OUT;
|
|
157 #for each row in $tmpSpace3, count the number of overlaps on both sides
|
|
158 my $left=join("",$tmpSpace,".left");
|
|
159 my $right=join("",$tmpSpace,".right");
|
|
160 system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left");
|
|
161 system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right");
|
|
162
|
|
163 my $Lcount=`wc -l $left|cut -f1 -d" "`;
|
|
164 my $Rcount=`wc -l $right|cut -f1 -d" "`;
|
|
165 if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"}
|
|
166 system("paste $left $right > $tmpSpace5");
|
|
167 system ("rm $left $right");
|
|
168 open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n";
|
|
169 open (OUT,">$tmpSpace6") or die "Cant write in this directory\n";
|
|
170 while(<IN>){
|
|
171 $_=~chomp;
|
|
172 @lines=split("\t",$_);
|
|
173 if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"}
|
|
174 }
|
|
175 close IN;
|
|
176 close OUT;
|
|
177
|
|
178 system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8");
|
|
179 #system("head $tmpSpace7 $tmpSpace8");
|
|
180 open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n";
|
|
181 open (OUT,">$tmpSpace9") or die "Cant write in this directory\n";
|
|
182 my ($Samples,$NumSamples,$EVENT);
|
|
183 while(<IN>){
|
|
184 $_=~chomp;
|
|
185 @lines=split("\t",$_);
|
|
186
|
|
187 if ($lines[8] > 0){
|
|
188 $Samples=$lines[7].";".$lines[9];
|
|
189 $Samples=~s/^NA;//;
|
|
190 $NumSamples=$lines[6]+$lines[8];
|
|
191 }
|
|
192 else{
|
|
193 $Samples=$lines[7];
|
|
194 $NumSamples=$lines[6];
|
|
195 }
|
|
196 print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n";
|
|
197 }
|
|
198 close IN;
|
|
199 close OUT;
|
|
200 print STDERR "completed with $VCF[$i]\n";
|
|
201 system("cp $tmpSpace9 $tmpSpace7");
|
|
202 }
|
|
203
|
|
204 system("cp $tmpSpace7 $outFile");
|
|
205 unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6);
|
|
206 print STDERR "Your results are in $outFile\n";
|
|
207
|
|
208
|
|
209 sub usage(){
|
|
210 print "
|
|
211 ###
|
|
212 ### This script will merge multiple SoftSearch VCF files
|
|
213 ###
|
|
214
|
|
215 Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file>
|
|
216
|
|
217 Note: Must have bedtools installed and in your path\n\n\n";
|
|
218 }
|