Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison 2.4/script/Merge_SV.pl @ 16:8eb7d93f7e58 draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Sat, 31 May 2014 11:23:36 -0400 |
parents | e3609c8714fb |
children |
comparison
equal
deleted
inserted
replaced
15:da93b6f4d684 | 16:8eb7d93f7e58 |
---|---|
1 #!/usr/bin/perl -w | |
2 use Getopt::Long; | |
3 use List::Util qw(min max); | |
4 | |
5 | |
6 #Declare variables | |
7 my ($window,$tmpSpace,$usage,$help,$outFile); | |
8 | |
9 GetOptions( | |
10 'v=s{2,}' => \@VCF, | |
11 'o:s' => \$outFile, | |
12 'w:s' => \$window, | |
13 'h|help' => \$help | |
14 ); | |
15 | |
16 if((!@VCF)||($help)){&usage();exit} | |
17 | |
18 | |
19 if (!$window) { | |
20 $window=500; | |
21 } | |
22 if (!$outFile) { | |
23 $outFile="merged.vcf.out"; | |
24 } | |
25 ########################################### | |
26 # Protect against merging too many results | |
27 ########################################### | |
28 $tmpSpace='temporarySV_merge'; | |
29 if (-e $tmpSpace) { | |
30 #Delete temp file if it exists | |
31 unlink $tmpSpace; | |
32 } | |
33 ########################################### | |
34 #For each VCF, create a BEDPE file | |
35 ########################################### | |
36 | |
37 open(OUT,">>$tmpSpace") or die "Can't write in this directory\n"; | |
38 for (my $i=0;$i<@VCF;$i++){ | |
39 #print STDERR "opening $VCF[$i]\n"; | |
40 open(VCF,$VCF[$i]) or die &usage(); | |
41 while (<VCF>) { | |
42 next if ($_=~/^#/); | |
43 chomp; | |
44 @line=split("\t",$_); | |
45 $mate=$line[4]; | |
46 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; | |
47 @mate=split(/:/,$mate); | |
48 $end1a=$line[1]-$window; | |
49 $end1b=$line[1]+$window; | |
50 $end2a=$mate[1]-$window; | |
51 $end2b=$mate[1]+$window; | |
52 next if (($end1a<0)||($end2a<0)); | |
53 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { | |
54 next; | |
55 } | |
56 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; | |
57 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; | |
58 } | |
59 } | |
60 close OUT; | |
61 | |
62 ########################################### | |
63 #Now merge the BEDPE into a unique BEDPE | |
64 ########################################### | |
65 #Make sure the BEDPE is sorted | |
66 #print "Make sure the BEDPE is sorted\n"; | |
67 my $tmpSpace2=join("",$tmpSpace,".2"); | |
68 system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2"); | |
69 unlink($tmpSpace); | |
70 | |
71 #Create output files for the left and right merged BEDPE | |
72 my $tmpSpace3=join("",$tmpSpace,".3"); | |
73 my $tmpSpace4=join("",$tmpSpace,".4"); | |
74 | |
75 open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n"; | |
76 open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n"; | |
77 | |
78 open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n"; | |
79 #Initialize positions | |
80 #my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4); | |
81 my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4); | |
82 while (<BEDPE>) { | |
83 ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_); | |
84 if(!$Echr1){ | |
85 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_); | |
86 } | |
87 while ( | |
88 ($chr1 =~ /^$Echr1$/)&& | |
89 ($pos2 <= $Epos2+$window)&& | |
90 ($chr2 =~ /^$Echr2$/)&& | |
91 ($pos3 <= $Epos3+$window) | |
92 ) | |
93 {$nextline = <BEDPE> ; | |
94 last if (!$nextline); | |
95 $nextline=~chomp; | |
96 ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline); | |
97 #print "NEXTLINE=$nextline"; | |
98 push (@chr1,$chr1); | |
99 push (@pos1,$pos1); | |
100 push (@pos2,$pos2); | |
101 push (@chr2,$chr2); | |
102 push (@pos3,$pos3); | |
103 push (@pos4,$pos4); | |
104 } | |
105 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]); | |
106 #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4); | |
107 if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp} | |
108 if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp} | |
109 print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n"; | |
110 print OUT2 join ("\t",$chr2,$pos3,$pos4); | |
111 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4); | |
112 } | |
113 close BEDPE; | |
114 close OUT; | |
115 unlink ($tmpSpace2); | |
116 | |
117 ##################################################################### | |
118 #Now find out for each Unique BEDPE, how many Samples was the SV in? | |
119 ##################################################################### | |
120 #FOR EACH VCF | |
121 #get NAME | |
122 | |
123 my $tmpSpace5=join("",$tmpSpace,".5"); | |
124 my $tmpSpace6=join("",$tmpSpace,".6"); | |
125 my $tmpSpace7=join("",$tmpSpace,".7"); | |
126 my $tmpSpace8=join("",$tmpSpace,".8"); | |
127 my $tmpSpace9=join("",$tmpSpace,".9"); | |
128 | |
129 #Create a placeholder file | |
130 system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7"); | |
131 #Convert the VCF into a BED PE | |
132 for (my $i=0;$i<@VCF;$i++){ | |
133 open (OUT,">$tmpSpace5") or die "Cant write in this directory\n"; | |
134 open(VCF,$VCF[$i]) ; | |
135 print STDERR "Starting on $VCF[$i]\n"; | |
136 while (<VCF>) { | |
137 next if ($_=~/^#/); | |
138 chomp; | |
139 @line=split("\t",$_); | |
140 $mate=$line[4]; | |
141 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; | |
142 @mate=split(/:/,$mate); | |
143 $end1a=$line[1]-$window; | |
144 $end1b=$line[1]+$window; | |
145 $end2a=$mate[1]-$window; | |
146 $end2b=$mate[1]+$window; | |
147 next if (($end1a<0)||($end2a<0)); | |
148 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { | |
149 #print "$_\n"; | |
150 next; | |
151 } | |
152 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; | |
153 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; | |
154 } | |
155 close VCF; | |
156 close OUT; | |
157 #for each row in $tmpSpace3, count the number of overlaps on both sides | |
158 my $left=join("",$tmpSpace,".left"); | |
159 my $right=join("",$tmpSpace,".right"); | |
160 system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left"); | |
161 system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right"); | |
162 | |
163 my $Lcount=`wc -l $left|cut -f1 -d" "`; | |
164 my $Rcount=`wc -l $right|cut -f1 -d" "`; | |
165 if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"} | |
166 system("paste $left $right > $tmpSpace5"); | |
167 system ("rm $left $right"); | |
168 open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n"; | |
169 open (OUT,">$tmpSpace6") or die "Cant write in this directory\n"; | |
170 while(<IN>){ | |
171 $_=~chomp; | |
172 @lines=split("\t",$_); | |
173 if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"} | |
174 } | |
175 close IN; | |
176 close OUT; | |
177 | |
178 system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8"); | |
179 #system("head $tmpSpace7 $tmpSpace8"); | |
180 open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n"; | |
181 open (OUT,">$tmpSpace9") or die "Cant write in this directory\n"; | |
182 my ($Samples,$NumSamples,$EVENT); | |
183 while(<IN>){ | |
184 $_=~chomp; | |
185 @lines=split("\t",$_); | |
186 | |
187 if ($lines[8] > 0){ | |
188 $Samples=$lines[7].";".$lines[9]; | |
189 $Samples=~s/^NA;//; | |
190 $NumSamples=$lines[6]+$lines[8]; | |
191 } | |
192 else{ | |
193 $Samples=$lines[7]; | |
194 $NumSamples=$lines[6]; | |
195 } | |
196 print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n"; | |
197 } | |
198 close IN; | |
199 close OUT; | |
200 print STDERR "completed with $VCF[$i]\n"; | |
201 system("cp $tmpSpace9 $tmpSpace7"); | |
202 } | |
203 | |
204 system("cp $tmpSpace7 $outFile"); | |
205 unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6); | |
206 print STDERR "Your results are in $outFile\n"; | |
207 | |
208 | |
209 sub usage(){ | |
210 print " | |
211 ### | |
212 ### This script will merge multiple SoftSearch VCF files | |
213 ### | |
214 | |
215 Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file> | |
216 | |
217 Note: Must have bedtools installed and in your path\n\n\n"; | |
218 } |