Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison SoftSearch.pl @ 10:b343822022c3 draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Thu, 29 May 2014 08:07:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:4322038cb77f | 10:b343822022c3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 #### | |
4 #### Usage: SoftSearch.pl [-lqrmsd] -b <BAM> -f <Genome.fa> -sam <samtools path> -bed <bedtools path> | |
5 #### Created 1-30-2012 by Steven Hart, PhD | |
6 #### hart.steven@mayo.edu | |
7 #### Required bedtools & samtools to be in path | |
8 | |
9 | |
10 use lib "/home/plus91/2.4/lib" ; | |
11 | |
12 use Getopt::Long; | |
13 use strict; | |
14 use warnings; | |
15 #use Data::Dumper; | |
16 use LevD; | |
17 use File::Basename; | |
18 | |
19 my ($INPUT_BAM,$INPUT_FASTA,$OUTPUT_FILE,$minSoft,$minSoftReads,$dist_To_Soft,$bedtools,$samtools); | |
20 my ($minRP, $temp_output, $num_sd, $MapQ, $chrom, $unmated_pairs, $minBQ, $pair_only, $disable_RP_only); | |
21 my ($levD_local_threshold, $levD_distl_threshold,$pe_upper_limit,$high_qual,$sv_only,$blacklist,$genome_file,$verbose); | |
22 | |
23 my $cmd = ""; | |
24 | |
25 #Declare variables | |
26 GetOptions( | |
27 'b=s' => \$INPUT_BAM, | |
28 'f=s' => \$INPUT_FASTA, | |
29 'o:s' => \$OUTPUT_FILE, | |
30 'm:i' => \$minRP, | |
31 'l:i' => \$minSoft, | |
32 'r:i' => \$minSoftReads, | |
33 't:i' => \$temp_output, | |
34 's:s' => \$num_sd, | |
35 'd:i' => \$dist_To_Soft, | |
36 'q:i' => \$MapQ, | |
37 'c:s' => \$chrom, | |
38 'u:s' => \$unmated_pairs, | |
39 'x:s' => \$minBQ, | |
40 'p' => \$pair_only, | |
41 'g' => \$disable_RP_only, | |
42 'j:s' => \$levD_local_threshold, | |
43 'k:s' => \$levD_distl_threshold, | |
44 'a:s' => \$pe_upper_limit, | |
45 'e:s' => \$high_qual, | |
46 'L' => \$sv_only, | |
47 'v' => \$verbose, | |
48 'blacklist:s' => \$blacklist, | |
49 'genome_file:s' => \$genome_file, | |
50 "help|h|?" => \&usage); | |
51 unless($sv_only){$sv_only=""}; | |
52 if(defined($INPUT_BAM)){$INPUT_BAM=$INPUT_BAM} else {print usage();die "Where is the BAM file?\n\n"} | |
53 if(defined($INPUT_FASTA)){$INPUT_FASTA=$INPUT_FASTA} else {print usage();die "Where is the fasta file?\n\n"} | |
54 my ($fn,$pathname) = fileparse($INPUT_BAM,".bam"); | |
55 my $index=`ls $pathname/$fn*bai|head -1`; | |
56 #my $index =`ls \${INPUT_BAM%.bam}*bai`; | |
57 #print "INDEX=$index\n"; | |
58 if(!$index){die "\n\nERROR: you need index your BAM file\n\n"} | |
59 | |
60 ### get current time | |
61 print "Start Time : " . &spGetCurDateTime() . "\n"; | |
62 my $now = time; | |
63 | |
64 #if(defined($OUTPUT_FILE)){$OUTPUT_FILE=$OUTPUT_FILE} else {$OUTPUT_FILE="output.vcf"; print "\nNo outfile specified. Using output.vcf as default\n\n"} | |
65 if(defined($minSoft)){$minSoft=$minSoft} else {$minSoft=5} | |
66 if(defined($minRP)){$minRP=$minRP} else {$minRP=5} | |
67 if(defined($minSoftReads)){$minSoftReads=$minSoftReads} else {$minSoftReads=5} | |
68 if(defined($dist_To_Soft)){$dist_To_Soft=$dist_To_Soft} else {$dist_To_Soft=300} | |
69 if(defined($num_sd)){$num_sd=$num_sd} else {$num_sd=6} | |
70 if(defined($MapQ)){$MapQ=$MapQ} else {$MapQ=20} | |
71 | |
72 unless (defined $pe_upper_limit) { $pe_upper_limit = 10000; } | |
73 unless (defined $levD_local_threshold) { $levD_local_threshold = 0.05; } | |
74 unless (defined $levD_distl_threshold) { $levD_distl_threshold = 0.05; } | |
75 #Get sample name if available | |
76 my $SAMPLE_NAME=""; | |
77 my $OUTNAME =""; | |
78 $SAMPLE_NAME=`samtools view -f2 -H $INPUT_BAM|awk '{if(\$1~/^\@RG/){sub("ID:","",\$2);name=\$2;print name}}'|head -1`; | |
79 #print $SAMPLE_NAME; | |
80 $SAMPLE_NAME=~s/\n//g; | |
81 #print $SAMPLE_NAME; | |
82 my $increase=rand(); | |
83 my $fname="test"; | |
84 $increase++; | |
85 if (!$OUTPUT_FILE){ | |
86 if($SAMPLE_NAME ne ""){$OUTNAME=$SAMPLE_NAME.".vcf"} | |
87 else {$OUTNAME="output.vcf"} | |
88 } | |
89 else{$OUTNAME=$OUTPUT_FILE} | |
90 | |
91 print "Writing results to $OUTNAME\n"; | |
92 | |
93 | |
94 ##Make sure if submitting on SGE, to prepned the "chr". Not all referecne FAST files require "chr", so we shouldn't force the issue. | |
95 if(!defined($chrom)){$chrom=""} | |
96 if(!defined($unmated_pairs)){$unmated_pairs=0} | |
97 | |
98 my $badQualValue=chr($MapQ); | |
99 if(defined($minBQ)){ $badQualValue=chr($minBQ); } | |
100 | |
101 if($badQualValue eq "#"){$badQualValue="\#"} | |
102 | |
103 # adding and cheking for samtools and bedtools in the PATh | |
104 ## check for bedtools and samtools in the path | |
105 $bedtools=`which intersectBed` ; | |
106 if(!defined($bedtools)){die "\nError:\n\tno bedtools. Please install bedtools and add to the path\n";} | |
107 #$samtools=`samtools 2>&1`; | |
108 $samtools=`which samtools`; | |
109 if($samtools !~ /(samtools)/i){die "\nError:\n\tno samtools. Please install samtools and add to the path\n";} | |
110 | |
111 print "Usage = SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -s $num_sd -c $chrom -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME \n\n"; | |
112 sub usage { | |
113 print "\nusage: SoftSearch.pl [-cqlrmsd] -b <BAM> -f <Genome.fa> \n"; | |
114 print "\t-q\t\tMinimum mapping quality [20]\n"; | |
115 print "\t-l\t\tMinimum length of soft-clipped segment [5]\n"; | |
116 print "\t-r\t\tMinimum depth of soft-clipped reads at position [5]\n"; | |
117 print "\t-m\t\tMinimum number of discordant read pairs [5]\n"; | |
118 print "\t-s\t\tNumber of sd away from mean to be considered discordant [6]\n"; | |
119 print "\t-u\t\tNumber of unmated pairs [0]\n"; | |
120 print "\t-d\t\tMax distance between soft-clipped segments and discordant read pairs [300]\n"; | |
121 print "\t-o\t\tOutput file name [output.vcf]\n"; | |
122 print "\t-t\t\tPrint temp files for debugging [no|yes]\n"; | |
123 print "\t-c\t\tuse only this chrom or chr:pos1-pos2\n"; | |
124 print "\t-p\t\tuse paired-end mode only. In other words, don't try to find soft-clipping events!\n"; | |
125 print "\t-g\t\tEnable paired-only seach. This will look for discordant read pairs even without soft clips.\n"; | |
126 print "\t-a\t\tset the minimum distance for a discordant read pair without soft-clipping info [10000]\n"; | |
127 print "\t-L\t\tFlag to print out even small deletions (low quality)\n"; | |
128 print "\t-e\t\tdisable strict quality filtering of base qualities in soft-clipped reads [no]\n"; | |
129 print "\t-blacklist\tareas of the genome to skip calling. Requires -genome_file\n"; | |
130 print "\t-genome_file\ttab seperated value of chromosome name and length. Only used with -blacklist option\n\n"; | |
131 | |
132 exit 1; | |
133 } | |
134 | |
135 | |
136 ############################################################# | |
137 # create temporary variable name | |
138 ############################################################# | |
139 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`); | |
140 our $random_name=join "", map { ("a".."z")[rand 26] } 1..8; | |
141 | |
142 ############################################################# | |
143 ## create green list | |
144 ############################################################## | |
145 # | |
146 my $new_blacklist=""; | |
147 if($blacklist){ | |
148 if(!$genome_file){die "if using a blacklist, you must also specify the location of a genome_file | |
149 The format of the genome_file should be | |
150 chrom size | |
151 chr1 249250621 | |
152 chr2 243199373 | |
153 ... | |
154 | |
155 If using hg19, you can ge the genome file by | |
156 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from hg19.chromInfo\" > hg19.genome";} | |
157 | |
158 $cmd=join("","complementBed -i $blacklist -g $genome_file >",$random_name,".bed") ; | |
159 system ($cmd); | |
160 $new_blacklist=join(""," -L ",$random_name,".bed "); | |
161 } | |
162 | |
163 if($verbose){print "CMD=$cmd\nBlacklist is $new_blacklist\n";} | |
164 | |
165 | |
166 | |
167 | |
168 | |
169 ############################################################# | |
170 # Calcualte insert size distribution of properly mated reads | |
171 ############################################################# | |
172 | |
173 #Change for compatability with other operating systems | |
174 #my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)**2)}'`; | |
175 | |
176 my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|head -10000|cut -f9|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)^2)}'`; | |
177 #my ($mean,$stdev)=split(/ /,$metrics); | |
178 my ($mean,$stdev)=split(/\s/,$metrics); | |
179 $stdev=~s/\n//; | |
180 my $upper_limit=int($mean+($num_sd*$stdev)); | |
181 my $lower_limit=int($mean-($num_sd*$stdev)); | |
182 die if (!$mean); | |
183 print qq{The mean insert size is $mean +/- $stdev (sd) | |
184 The upper limit = $upper_limit | |
185 The lower limit = $lower_limit\n | |
186 }; | |
187 if($lower_limit<0){ | |
188 print "Warning!! Given this insert size distribution, we can not call small indels. No other data will be affected\n"; | |
189 $lower_limit=1; | |
190 } | |
191 my $tmp_name=join ("",$random_name,".tmp.bam"); | |
192 my $random_file_sc = ""; | |
193 my $command = ""; | |
194 | |
195 ############################################################# | |
196 # Make sam file that has soft clipped reads | |
197 ############################################################# | |
198 #give file a name | |
199 if(!defined($pair_only)){ | |
200 $random_file_sc=join ("",$random_name,".sc.sam"); | |
201 $command=join ("","samtools view -q $MapQ -F 1024 $INPUT_BAM $chrom $new_blacklist| awk '{OFS=\"\\t\"}{c=0;if(\$6~/S/){++c};if(c == 1){print}}' | perl -ane '\$TR=(\@F[10]=~tr/\#//);if(\$TR<2){print}' > ", $random_file_sc); | |
202 | |
203 print "Making SAM file of soft-clipped reads\n"; | |
204 if($verbose){ print "$command\n";} | |
205 system("$command"); | |
206 | |
207 ############################################################# | |
208 # Find areas that have deep enough soft-clip coverage | |
209 print "Identifying soft-clipped regions that are at least $minSoft bp long \n"; | |
210 open (FILE,"$random_file_sc")||die "Can't open soft-clipped sam file $random_file_sc\n"; | |
211 | |
212 my $tmpfile=join("",$random_file_sc,".sc.passfilter"); | |
213 open (OUT,">$tmpfile")||die "Can't write files here!\n"; | |
214 | |
215 while(<FILE>){ | |
216 @_ = split(/\t/, $_); | |
217 #### parse CIGAR string and create a hash of array of each operation | |
218 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]); | |
219 my $hash; | |
220 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR; | |
221 | |
222 #for ($i=0; $i<=$#softclip_pos; $i++) { | |
223 foreach my $softclip (@{$hash->{S}}) { | |
224 #if ($CIGAR[$softclip_pos[$i]] > $minSoft){ | |
225 if ($softclip > $minSoft){ | |
226 ###############Make sure base qualities don't have more than 2 bad marks | |
227 my $qual=$_[10]; | |
228 my $TR=($qual=~tr/$badQualValue//); | |
229 if($badQualValue eq "#"){ $TR=($qual=~tr/\#//); } | |
230 #Skip the soft clip if there is more than 2 bad qual values | |
231 #next if($TR > 2); | |
232 # if (!$high_qual){next if($TR > 2);} | |
233 print OUT; | |
234 last; | |
235 } | |
236 } | |
237 } | |
238 close FILE; | |
239 close OUT; | |
240 | |
241 $command=join(" ","mv",$tmpfile,$random_file_sc); | |
242 if($verbose){ print "$command\n";} | |
243 system("$command"); | |
244 } | |
245 | |
246 ######################################################### | |
247 #Stack up SoftClips | |
248 ######################################################### | |
249 my $random_file=join("",$random_name,".sc.direction.bed"); | |
250 if(!defined($pair_only)){ | |
251 open (FILE,"$random_file_sc")|| die "Can't open sam file\n"; | |
252 #$random_file=join("",$random_name,".sc.direction"); | |
253 | |
254 print "Calling sides of soft-clips\n"; | |
255 #\nTMPOUT=$random_file\tINPUT=$random_file_sc\n\n"; | |
256 open (TMPOUT,">$random_file")|| die "Can't create tmp file\n"; | |
257 | |
258 while (<FILE>){ | |
259 @_ = split(/\t/, $_); | |
260 #### parse CIGAR string and create a hash of array of each operation | |
261 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]); | |
262 my $hash; | |
263 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR; | |
264 | |
265 #### next if softclips on each end | |
266 next if ($_[5] =~ /^[0-9]+S.*S$/); | |
267 | |
268 #### next softclip occurs in the middle | |
269 next if ($_[5] =~ /^[0-9]+[^S][0-9].*S.+$/); | |
270 | |
271 my $softclip = $hash->{S}[0]; | |
272 | |
273 my $end1 = 0; | |
274 my $end2 = 0; | |
275 my $softBases = ""; | |
276 my $right_corrected="";my $left_corrected=""; | |
277 | |
278 if ($softclip > $minSoft) { | |
279 | |
280 ####If the soft clip occurs at end of read and its on the minus strand, then it's a right clip | |
281 if ($_[5] =~ /^.*S$/) { | |
282 $end1=$_[3]+length($_[9])-$softclip-1; | |
283 $end2=$end1+1; | |
284 next if ($end1<0); | |
285 #RIGHT clip on Minus | |
286 $softBases=substr($_[9], length($_[9])-$softclip, length($_[9])); | |
287 #Right clips don't always get clipped correctly, so fix that | |
288 # Check to see if sc base matches ref | |
289 $right_corrected=baseCheck($_[2],$end2,"right",$softBases); | |
290 print TMPOUT "$right_corrected\n" | |
291 | |
292 } else { | |
293 #### Begins with S (left clip) | |
294 $end1=$_[3]-$softclip; | |
295 next if ($end1<0); | |
296 | |
297 $softBases=substr($_[9], 0,$softclip);#print "TMP=$softBases\n"; | |
298 $left_corrected=baseCheck($_[2],$end1,"left",$softBases); | |
299 if(!$left_corrected){print "baseCheck($_[2],$end1,left,$softBases)\n";next} | |
300 print TMPOUT "$left_corrected\n"; | |
301 #print "\nSEQ=$_[9]\t\n"; | |
302 | |
303 } | |
304 } | |
305 } | |
306 close FILE; | |
307 close TMPOUT; | |
308 } | |
309 sub baseCheck{ | |
310 my ($chrom,$pos,$direction,$softBases)=@_; | |
311 #skip if position is less than 0, which is caused by MT DNA | |
312 return if ($pos<0); | |
313 my $exit=""; | |
314 | |
315 while(!$exit){ | |
316 if($direction=~/right/){ | |
317 my $refBase=getSeq($chrom,$pos,$INPUT_FASTA); | |
318 my $softBase=substr($softBases,0,1); | |
319 if ($softBase !~ /$refBase/){ | |
320 my $value=join("\t",$chrom,$pos,$pos+1,join("|",$softBases,$direction)); | |
321 $exit="STOP"; | |
322 return $value; | |
323 } | |
324 else{ | |
325 $pos=$pos+1; | |
326 $softBases=substr($softBases, 1,length($softBases)); | |
327 } | |
328 } | |
329 else{ | |
330 my $refBase=getSeq($chrom,$pos+1,$INPUT_FASTA); | |
331 my $softBase=substr($softBases,-1,1); | |
332 if ($softBase !~ /$refBase/){ | |
333 $pos=$pos-1+length($softBases); | |
334 my $value=join("\t",$chrom,$pos-1,$pos,join("|",$softBases,$direction)); | |
335 $exit="STOP"; | |
336 return $value; | |
337 } | |
338 else{ | |
339 $pos=$pos-1; | |
340 $softBases=substr($softBases, 0, -1); | |
341 #print "Trying again $softBases\n"; | |
342 } | |
343 | |
344 } | |
345 | |
346 } | |
347 } | |
348 #Remove SAM files to conserve space | |
349 unlink($random_file_sc); | |
350 | |
351 | |
352 my $random_file_disc="$INPUT_BAM"; | |
353 ### | |
354 # | |
355 ###################################################### | |
356 # Transform Read pair groups into softclip equivalents | |
357 ###################################################### | |
358 # | |
359 # | |
360 # | |
361 my $v=""; | |
362 #if($disable_RP_only){ | |
363 print "Running Bam2pair.pl\n"; | |
364 print "Looking for discordant read pairs without requiring soft-clipping information\n"; | |
365 use FindBin qw($Bin); | |
366 my $path=$Bin; | |
367 # print"\n\nPATH=$path\n\n"; | |
368 if($verbose){$v="-v"} | |
369 my $tmp_out=join("",$random_name,"RP.out"); | |
370 $command=join("","perl ",$path,"/Bam2pair.pl -b $random_file_disc -o $tmp_out -isize $pe_upper_limit -winsize $dist_To_Soft -min $minRP -chrom $chrom -prefix $random_name -q $MapQ -blacklist $random_name.bed $v"); | |
371 if($verbose){ print "$command\n"}; | |
372 system("$command"); | |
373 $command=join("","perl -ane '\$end1=\@F[1];\$end2=\@F[3];print join(\"\\t\",\@F[0..1],\$end1,\"unknown|left\");print \"\\n\";print join(\"\\t\",\@F[2..3],\$end2,\"unknown|left\");print \"\\n\"' ", $tmp_out," >> ",$random_file); | |
374 if($verbose){print "$command\n"}; | |
375 system($command); | |
376 unlink($tmp_out); | |
377 #} | |
378 # | |
379 | |
380 | |
381 ###################################################### | |
382 unlink("$random_file","$tmp_name","$random_file","$index","$random_name","$new_blacklist") if (-z $random_file || ! -e $random_file ) ; | |
383 if (-z $random_file || ! -e $random_file){ | |
384 print "Softclipped file is empty($random_file).\nNo soft clipping found using desired paramters\n\n"; | |
385 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
386 &print_header(); | |
387 close OUT; | |
388 exit; | |
389 } | |
390 | |
391 | |
392 ############################################################# | |
393 # Make sure there are enough soft-clippped supporting reads | |
394 ############################################################# | |
395 my $outfile=join("",$random_file,".sc.merge.bed"); | |
396 #sortbed -i .sc.direction | mergeBed -nms -d 25 -i stdin > .sc.merge.bed | |
397 $command=join(" ","sortBed -i",$random_file," | mergeBed -nms -i stdin","|egrep \";|,\"","|awk '{OFS=\"\t\"}(NF==4)'",">",$outfile); | |
398 | |
399 print "$command\n" if ($verbose); | |
400 system("$command"); | |
401 | |
402 if (-z $outfile || ! -e $outfile){ | |
403 unlink("$tmp_name","$random_file","$outfile","$index","$random_name","$new_blacklist"); | |
404 print "mergeBed file is empty.\nNo strucutral variants found\n\n" ; | |
405 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
406 &print_header(); | |
407 close OUT; | |
408 exit; | |
409 } | |
410 | |
411 print "completed mergeBed\n"; | |
412 | |
413 ############################################################### | |
414 # If left and right are on the same line, make into 2 lines | |
415 ############################################################### | |
416 open (INFILE,$outfile)||die "couldn't open temp file : $. \n\n"; | |
417 my $tmp2=join("",$random_name,".sc.fixed.merge.bed"); | |
418 #print "INFILE=$outfile\tOUTFILE=$tmp2\n\n"; | |
419 #INPUT FORMAT=chr9\t131467\t131473\tATGCTTATTAAAA|left;TTATTAAAAGCATA|left | |
420 open (OUTFILE,">$tmp2")||die "couldn't create temp file : $. \n\n"; | |
421 while(<INFILE>){ | |
422 chomp $_; | |
423 my $l = $_; | |
424 | |
425 my @a = split(/\t/, $l); | |
426 my $info = $a[3]; | |
427 my @info_arr = split(/\;/, $info); | |
428 my @left_arr=(); | |
429 my @right_arr=(); | |
430 @left_arr = grep(/left/, @info_arr); | |
431 @right_arr = grep(/right/, @info_arr); | |
432 | |
433 #New | |
434 my $left = join(";", @left_arr); | |
435 my $right = join(";", @right_arr); | |
436 $info = join(";", @info_arr); | |
437 | |
438 if((@left_arr) && (@right_arr)){ | |
439 print OUTFILE "$a[0]\t$a[1]\t$a[2]\t$left\n$a[0]\t$a[1]\t$a[2]\t$right\n"; | |
440 } else{ | |
441 my $all=join("\t",@a[0..2],$info); | |
442 print OUTFILE "$all\n"; | |
443 } | |
444 } | |
445 | |
446 # make sure output file name is $outfile | |
447 $command=join(" ","sed -e '/ /s//\t/g'", $tmp2,"|awk 'BEGIN{OFS=\"\\t\"}(NF==4)'", "|perl -pne 's/ /\t/g'>",$outfile); | |
448 system("$command"); | |
449 if($verbose){print "$command\n"}; | |
450 unlink("$tmp_name","$random_file","$tmp2","$outfile","$index","random_name","$new_blacklist") if (-z $outfile || ! -e $outfile) ; | |
451 if (-z $outfile || ! -e $outfile){ | |
452 print "Fixed mergeBed file is empty($outfile).\nNo strucutral variants found\n\n"; | |
453 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
454 &print_header(); | |
455 close OUT; | |
456 exit; | |
457 } | |
458 | |
459 print "completed fixing mergeBed\n\n"; | |
460 | |
461 ############################################################### | |
462 # Seperate directions of soft clips | |
463 ############################################################### | |
464 my $left_sc = join("", "left", $tmp2); | |
465 my $right_sc = join("", "right", $tmp2); | |
466 use FindBin qw($Bin); | |
467 #my $path=$Bin; | |
468 | |
469 $command=join("","grep left ", $tmp2, " |sed -e '/left /s//left\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$left_sc); | |
470 system("$command"); | |
471 #print "$command\n"; | |
472 $command=join("","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$right_sc); | |
473 #$command=join(" ","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g' >",$right_sc); | |
474 system("$command"); | |
475 #print "$command\n"; | |
476 #die "CHECK $right_sc\n"; | |
477 | |
478 ############################################################### | |
479 # Count the number and identify directions of soft clips | |
480 ############################################################### | |
481 print "Count the number and identify directions of soft clips\n"; | |
482 #print "looking in $outfile\n"; | |
483 $outfile=join("",$random_name,".sc.fixed.merge.bed"); | |
484 | |
485 open (INFILE,$outfile)||die "couldn't open temp file\n\n"; | |
486 my $tmp3 = join("", $random_file, "predSV"); | |
487 open (OUTFILE, ">$tmp3")||die "couldn't create temp file\n\n"; | |
488 while(<INFILE>){ | |
489 chomp; | |
490 @_=split(/\t/,$_); | |
491 my $count=tr/\;//;$count+=tr/\,//; | |
492 $count=$count+1; | |
493 my $left=0; | |
494 my $right=0; | |
495 | |
496 while ($_ =~ /left/g) { $left++ } # count number of right clips | |
497 while ($_ =~ /right/g) { $right++ } # count number of left clips | |
498 | |
499 ############################################################### | |
500 if ($count >= $minSoftReads){ | |
501 ####get longets soft-clipped read | |
502 my @clips=split(/\;|,|\|/,$_[3]); | |
503 | |
504 my ($max, $temp, $temp2, $temp3, $dir, $maxSclip) = (0) x 6; | |
505 for (my $i=0; $i<$count; $i++) { | |
506 my $plus1=$i+1; | |
507 $temp=length($clips[$i]); | |
508 $temp2=$clips[$plus1]; | |
509 $temp3=$clips[$i]; | |
510 | |
511 if ($temp > $max){ | |
512 $maxSclip=$temp3; | |
513 $max =$temp; | |
514 $dir=$temp2; | |
515 } else { | |
516 $max=$max; | |
517 $dir=$dir; | |
518 $maxSclip=$maxSclip; | |
519 } | |
520 $i++; | |
521 } | |
522 my $order2 = join("|", $left, $right); | |
523 #print join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n"; | |
524 print OUTFILE join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n"; | |
525 } elsif($_=~/unknown/){ | |
526 print OUTFILE join ("\t",@_[0..2],"NA","NA","left","NA","NA|NA") . "\n"; | |
527 print OUTFILE join ("\t",@_[0..2],"NA","NA","right","NA","NA|NA") . "\n"; | |
528 } | |
529 ####Format is Chrom,start, end,longest Soft-clip,length of longest Soft-clip, direction of longest soft-clip,#supporting softclips,#right Sclips|#left Sclips | |
530 } | |
531 close INFILE; | |
532 close OUTFILE; | |
533 | |
534 unlink("$tmp2","$tmp_name","$random_file","$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$new_blacklist") if (-z $tmp3 || !-e $tmp3) ; | |
535 | |
536 if (-z $tmp3 || !-e $tmp3){ | |
537 print "No structural variants found while Counting the number and identify directions of soft clips.\n" ; | |
538 | |
539 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
540 &print_header(); | |
541 close OUT; | |
542 exit; | |
543 | |
544 } | |
545 | |
546 print "Done counting Softclipped reads\n"; | |
547 ############################################################### | |
548 #### Print header information | |
549 ############################################################### | |
550 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
551 &print_header(); | |
552 close OUT; | |
553 | |
554 ############################################################### | |
555 ############################################################### | |
556 #### DO the bulk of the work | |
557 ############################################################### | |
558 use List::Util qw(min max); | |
559 open (FILE,"$tmp3")|| die "Can't open file\n"; | |
560 open (OUT,">>$OUTNAME")|| die "Can't open file\n"; | |
561 | |
562 #print "\nusing $tmp3 and writing to $OUTPUT_FILE \n"; | |
563 while (<FILE>){ | |
564 #If left clip {+- or -- or -+ }{+- are uninformative b/c they go upstream} | |
565 #If right clip {++ or -- or +-} | |
566 chomp $_; | |
567 my $line = $_; | |
568 my @info = split(/\t/, $_); | |
569 | |
570 if($info[5] eq "left") { | |
571 bulk_work("left", $line, $random_file_disc); | |
572 | |
573 } elsif ($info[5] eq "right") { | |
574 bulk_work("right", $line, $random_file_disc); | |
575 } | |
576 #if($. ==6){print "THIS IS LINE 6\n$_\n";die} | |
577 print "Completed line $.\n" if ($verbose); | |
578 } | |
579 close FILE; | |
580 close OUT; | |
581 | |
582 ############################################################################### | |
583 ############################################################################### | |
584 #### Delete temp files | |
585 my $meregedBed=join("",$random_name,".sc.direction.bed.sc.merge.bed"); | |
586 | |
587 if(defined($temp_output)){$temp_output=$temp_output} else {$temp_output="no"} | |
588 | |
589 if ($temp_output eq "no"){ | |
590 unlink("$tmp_name","$random_file","$tmp2",,"$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$meregedBed","$random_name.bed"); | |
591 } | |
592 ####Sort VCF | |
593 my $tmp=join(".",$random_name,"tmp"); | |
594 #Get header | |
595 $cmd="grep \"#\" $OUTNAME > $tmp"; | |
596 system($cmd); | |
597 #sort results | |
598 $cmd="grep -v \"#\" $OUTNAME|perl -pne 's/chr//'|sort -k1,1n -k2,2n|perl -ne 'print \"chr\".\$_' >>$tmp"; | |
599 system($cmd); | |
600 $cmd="mv $tmp $OUTNAME"; | |
601 system($cmd); | |
602 #remove entries next to each other | |
603 | |
604 | |
605 | |
606 | |
607 ############################################################# | |
608 ##May not need this anymore since filtering on left and right | |
609 ############################################################# | |
610 #my $tmpout=$OUTNAME; | |
611 #$tmpout.=".tmp"; | |
612 #use FindBin qw($Bin); | |
613 ##my $path=$Bin; | |
614 #$command="perl ".$path."/Extract_nSC.pl $OUTNAME -q nSC > $tmpout"; | |
615 ##print "Command=$command\n"; | |
616 #system($command); | |
617 #$command="perl ".$path."/reduce_redundancy.pl $tmpout $upper_limit |cut -f1-10 > $OUTNAME"; | |
618 ##print "$command\n"; | |
619 #system($command); | |
620 #system("rm $tmpout"); | |
621 ######################################################## | |
622 | |
623 | |
624 | |
625 | |
626 print "Analysis Completed\n\nYou did it!!!\n"; | |
627 print "Finish Time : " . &spGetCurDateTime() . "\n"; | |
628 $now = time - $now; | |
629 printf("\n\nTotal running time: %02d:%02d:%02d\n\n", int($now / 3600), int(($now % 3600) / 60), | |
630 int($now % 60)); | |
631 | |
632 exit; | |
633 | |
634 ############################################################################### | |
635 sub rev_comp { | |
636 my $dna = shift; | |
637 my $revcomp = reverse($dna); | |
638 $revcomp =~ tr/ACGTacgt/TGCAtgca/; | |
639 | |
640 return $revcomp; | |
641 } | |
642 | |
643 | |
644 ############################################################################### | |
645 #### to get reference base | |
646 sub getSeq{ | |
647 my ($chr,$pos,$fasta)=@_; | |
648 #don't require chr | |
649 #if($chr !~ /^chr/){die "$chr is not correct\n";} | |
650 # die "$pos is not a number\n" if ($pos <0); | |
651 my @result=(); | |
652 if ($pos <0){print "$pos is not a valid position (likely caused by circular MT chromosome)\n";return;} | |
653 | |
654 @result = `samtools faidx $fasta $chr:$pos-$pos`; | |
655 if($result[1]){chomp($result[1]); | |
656 return uc($result[1]); | |
657 } | |
658 return("NA"); | |
659 #### after return will not be printed | |
660 ####print "RESULTS=@result\n"; | |
661 } | |
662 | |
663 sub getBases{ | |
664 my ($chr,$pos1,$pos2,$fasta)=@_; | |
665 #don't require chr | |
666 #if($chr !~ /^chr/){die "$chr is not correct\n";} | |
667 my @result=(); | |
668 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";return;}; | |
669 | |
670 @result = `samtools faidx $fasta $chr:$pos1-$pos2`; | |
671 if(!$result[1]){$result[1]="NA"}; | |
672 chomp($result[1]); | |
673 return uc($result[1]); | |
674 | |
675 #### after return will not be printed | |
676 ####print "RESULTS=@result\n"; | |
677 } | |
678 ############################################################################### | |
679 #### to get time | |
680 sub spGetCurDateTime { | |
681 my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); | |
682 my $curDateTime = sprintf "%4d-%02d-%02d %02d:%02d:%02d", | |
683 $year+1900, $mon+1, $mday, $hour, $min, $sec; | |
684 return ($curDateTime); | |
685 } | |
686 | |
687 | |
688 ############################################################################### | |
689 #### print header | |
690 sub print_header { | |
691 my $date=&spGetCurDateTime(); | |
692 my $header = qq{##fileformat=VCFv4.1 | |
693 ##fileDate=$date | |
694 ##source=SoftSearch.pl | |
695 ##reference=$INPUT_FASTA | |
696 ##Usage= SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -u $unmated_pairs -s $num_sd -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME | |
697 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> | |
698 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> | |
699 ##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> | |
700 ##INFO=<ID=ISIZE,Number=.,Type=String,Description="Size of the SV"> | |
701 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> | |
702 ##FORMAT=<ID=lSC,Number=1,Type=Integer,Description="Length of the longest soft clips supporting the BND"> | |
703 ##FORMAT=<ID=nSC,Number=1,Type=Integer,Description="Number of supporting soft-clips\"> | |
704 ##FORMAT=<ID=uRP,Number=1,Type=Integer,Description="Number of unmated read pairs nearby Soft-Clips"> | |
705 ##FORMAT=<ID=levD_local,Number=1,Type=Float,Description="Levenstein distance between soft-clipped bases and the area around the original soft-clipped site"> | |
706 ##FORMAT=<ID=levD_distl,Number=1,Type=Float,Description="Levenstein distance between the soft-clipped bases and mate location"> | |
707 ##FORMAT=<ID=CTX,Number=1,Type=Integer,Description="Number of chromosomal translocations"> | |
708 ##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="Number of reads supporting Large Deletions"> | |
709 ##FORMAT=<ID=INS,Number=1,Type=Integer,Description="Number of reads supporting Large insertions"> | |
710 ##FORMAT=<ID=NOV_INS,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion"> | |
711 ##FORMAT=<ID=TDUP,Number=1,Type=Integer,Description="Number of reads supporting a tandem duplication"> | |
712 ##FORMAT=<ID=INV,Number=1,Type=Integer,Description="Number of reads supporting inversions"> | |
713 ##FORMAT=<ID=sDEL,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion"> | |
714 ##INFO=<ID=NO_MATE_SC,Number=1,Type=Flag,Description="When there is no softclipping of the mate read location, an appromiate position is used"> | |
715 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Dummy value for maintaining VCF-Spec"> | |
716 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$SAMPLE_NAME\n}; | |
717 | |
718 print OUT $header; | |
719 } | |
720 | |
721 | |
722 ############################################################################### | |
723 sub bulk_work { | |
724 print "#####################################@_\n" if ($verbose); | |
725 my ($side, $line, $file) = @_; | |
726 my $local_levD = 0; | |
727 my $distl_levD = 0; | |
728 | |
729 #my @info = split(/\t/, $line); | |
730 my @plus_Reads = split(/\t/, $line); | |
731 $plus_Reads[7] =~ s/\n//g; | |
732 | |
733 #### softclip length and softclip size. | |
734 my $lSC = $plus_Reads[4]; | |
735 my $nSC = $plus_Reads[6]; | |
736 | |
737 | |
738 #Get all types of compatible reads | |
739 #Get improperly paired reads (@ max distance) | |
740 | |
741 #### default value for left SIDE. | |
742 #If left-clip, then look downstream for match of softclipped reads to define a deletion, but look for DRPs upstream | |
743 my $sv_type = "SVTYPE=BND"; | |
744 my $start_local=0; my $end_local=0;my $target_local="";my $target_drp="";my $start_drp="";my $end_drp=""; | |
745 if ($side =~ /left/) { | |
746 $start_local = $plus_Reads[1]-$dist_To_Soft; | |
747 $end_local = $plus_Reads[2]; | |
748 $start_drp = $plus_Reads[1]; | |
749 $end_drp = $plus_Reads[1]+$dist_To_Soft; | |
750 | |
751 } | |
752 else{ | |
753 $start_local = $plus_Reads[1]; | |
754 $end_local = $plus_Reads[1]+$dist_To_Soft; | |
755 $start_drp = $plus_Reads[1]-$dist_To_Soft; | |
756 $end_drp = $plus_Reads[1]; | |
757 } | |
758 | |
759 $target_local=join("", $plus_Reads[0], ":", $start_local, "-", $end_local); | |
760 $target_drp=join("", $plus_Reads[0], ":", $start_drp, "-", $end_drp); | |
761 my $num_unmapped_pairs=""; | |
762 if ($side =~ /right/) { | |
763 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f8 -F 1536 -c $INPUT_BAM $target_drp`; | |
764 } else { | |
765 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp`; | |
766 } | |
767 if($verbose){print "samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp\n";} | |
768 | |
769 $num_unmapped_pairs=~s/\n//; | |
770 if($verbose){print "NUM UNMAPPED PAIRS= $num_unmapped_pairs\n";} | |
771 my $REF1_base = ""; | |
772 my $REF2_base = ""; | |
773 my $INFO_1 = ""; | |
774 my $INFO_2 = ""; | |
775 my $ALT_1 = ""; | |
776 my $ALT_2 = ""; | |
777 my $isize = 0; | |
778 my $QUAL = ""; | |
779 my $FORMAT = "GT:"; | |
780 | |
781 #### get 8 bit rand id | |
782 my $BND1_name = join "", map { ("a".."z")[rand 26] } 1..8; | |
783 my $BND2_name = join "", map { ("a".."z")[rand 26] } 1..8; | |
784 $BND1_name=join "_","BND",$BND1_name; | |
785 $BND2_name=join "_","BND",$BND2_name; | |
786 | |
787 my $counts = {CTX => 0, DEL => 0, INS => 0, INV => 0, TDUP => 0, NOV_INS => 0 }; | |
788 my $event_mate_info = {CTX => "", DEL => "", INS => "", INV => "", TDUP => "", NOV_INS => "" }; | |
789 | |
790 #### get mate pair info and counts per event | |
791 foreach my $e (sort keys %{$counts}) { | |
792 my $h = get_counts_n_info($e, $side, $MapQ, $file, $dist_To_Soft, $target_drp, $upper_limit, $lower_limit); | |
793 | |
794 $counts->{$e} = $h->{count}; | |
795 $event_mate_info->{$e} = $h->{info}; | |
796 } | |
797 #print Dumper($counts); | |
798 | |
799 my $max = 0; | |
800 my $type = "UNKNOWN"; | |
801 my $nRP = 0; | |
802 my $mate_info = "NA\tNA\tNA\tNA"; | |
803 my $summary = "GT:"; | |
804 | |
805 #### find max count of events and set type, nRP and info to corresponding | |
806 #### max count event. | |
807 #### also create a summary string of all counts to be added to VCF file. | |
808 foreach my $e (sort keys %{$counts}){ | |
809 # if ($counts->{$e} >=i $max){ | |
810 if ($counts->{$e} > $max){ | |
811 $type = $e .",". $counts->{$e}; | |
812 $nRP = $counts->{$e}; | |
813 | |
814 $max = $counts->{$e}; | |
815 | |
816 if (length($event_mate_info->{$e})) { | |
817 $mate_info = $event_mate_info->{$e}; | |
818 } | |
819 } | |
820 | |
821 $summary .= $e .",". $counts->{$e} .":"; | |
822 } | |
823 # print "done with Summary\n"; | |
824 #### remove last colon ":" from | |
825 $summary =~ s/:$//; | |
826 if (($minRP > $max)&&(!$disable_RP_only )){if ($verbose){print "FAILED BECAUSE ($minRP > $max)&&(!$disable_RP_only )"};return}; | |
827 | |
828 #### Run Levenstein distance on softclip in target region to find out if its a small deletion/insetion | |
829 #### passing 1: clip_seq, 2: chr, 3: start, 4: end, 5: ref file. | |
830 my $levD = new LevD; | |
831 ######################################################## | |
832 ######################################################## | |
833 ######################################################## | |
834 | |
835 #### redefine start and end location for LevD calc. | |
836 # $start = $plus_Reads[1]-$dist_To_Soft; | |
837 # $end = $plus_Reads[2]; | |
838 my $num_bases_to_loc=0; | |
839 my $new_start=0; | |
840 my $new_end=0; | |
841 my $del_seq=""; | |
842 my $start = $start_local; | |
843 my $end = $end_local; | |
844 if ($lSC=~/NA/){$lSC=0} | |
845 | |
846 if ($side =~ /right/) { | |
847 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA); | |
848 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
849 $num_bases_to_loc=$levD->{index}; | |
850 $new_start = $plus_Reads[2]; | |
851 if ($plus_Reads[2]=~/^[0-9]/){$new_end=$plus_Reads[2]+$lSC}; | |
852 } | |
853 else{ | |
854 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA); | |
855 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
856 $num_bases_to_loc=$levD->{index}; | |
857 if ($plus_Reads[2]=~/^[0-9]/){$new_start=$plus_Reads[2]-$lSC}; | |
858 $new_end = $plus_Reads[2]; | |
859 } | |
860 if((!$new_start)||(!$new_end)||($new_start<0)){print "FAILED AT ((!$new_start)||(!$new_end)||($new_start<0))\n";return}; | |
861 | |
862 $del_seq=getBases($plus_Reads[0], $new_start,$new_end,$INPUT_FASTA); | |
863 ############################################################################## | |
864 # #If there is a match, where is the start position of the match? | |
865 # | |
866 ############################################################################## | |
867 | |
868 | |
869 #if $plus_Reads[3] eq "NA", then it was found without soft-clipped reads | |
870 if($plus_Reads[3] !~ /NA/){ | |
871 if (($local_levD < $levD_local_threshold)) { | |
872 return if (!$sv_only); | |
873 #### add value to summary to be written to vcf file. | |
874 $summary = "GT:sDel," . $plus_Reads[6]; | |
875 $type = "sDEL"; | |
876 ########################################################################### | |
877 ##### Printing output | |
878 | |
879 ######################################### | |
880 ##### Get DNA info | |
881 ######################################### | |
882 #$REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
883 $REF1_base = substr($del_seq, 0, 1); | |
884 | |
885 #### this is alt ref. for softclip its $plus_Reads[3] | |
886 $REF2_base = $del_seq; | |
887 $QUAL = 1/($local_levD + 0.001); | |
888 $QUAL = sprintf("%.2f",$QUAL); | |
889 $isize = length($del_seq); | |
890 | |
891 #### svtype = none for sDEL | |
892 #### isize = length($info[3]); | |
893 #### nRP = NA | |
894 #### mate_id = NA | |
895 #### CTX,:DEL,:....sDEL,## | |
896 $INFO_1=join(";", "SVTYPE=NA", "EVENT=$type", "ISIZE=$isize"); | |
897 | |
898 #Add Sample infomration | |
899 my $FORMAT="GT:sDEL"; | |
900 $FORMAT .= ":lSC:nSC:uRP:levD_local"; | |
901 my $SAMPLE= "0/1:"; | |
902 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD"; | |
903 | |
904 #### remove any white spaces. | |
905 $INFO_1=~s/\s//g; | |
906 $INFO_2=~s/\s//g; | |
907 | |
908 $BND1_name =~ s/^BND/LEVD/; | |
909 # If left, then the start position is plus_Reads[1]-isize | |
910 my $start_pos=0; | |
911 #Make sure Ref1 and Ref2 bases are different | |
912 if($REF2_base eq $REF1_base){$REF1_base="NA"} | |
913 if($side=~/left/){$start_pos=$plus_Reads[1]-$isize}else{$start_pos=$plus_Reads[1]}; | |
914 print OUT join ("\t", $plus_Reads[0], $start_pos, $BND1_name, $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
915 if ($verbose){print "No Softclipped reads here!\n"} | |
916 return; | |
917 } | |
918 } | |
919 | |
920 #### Otherwise, look for DRP mate info | |
921 #if($nRP=~/NA/){print "MATE_INFO=$mate_info\tSide=$side\tline=$line\n";} | |
922 my @mate_info_arr = split(/\t/, $mate_info); | |
923 $nRP = $mate_info_arr[3]; | |
924 my $mate_chr=$mate_info_arr[0]; | |
925 | |
926 if((! defined $nRP) || ($nRP =~ /na/i) || ($mate_chr =~ /NA/) ){ | |
927 #PRINT UNKNOWN | |
928 if ($nRP =~ /na/i){print "Can't find SC reads\n" if ($verbose);return}; | |
929 if ($verbose){print "There is an unknown\nNRP=$nRP Mate_CHR=$mate_chr minRP=$minRP\n"} | |
930 $summary .= ":unknown," . $plus_Reads[6]; | |
931 $type = "unknown"; | |
932 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
933 $REF2_base = $plus_Reads[3]; | |
934 $BND1_name =~ s/^BND/UNKNOWN/; | |
935 $QUAL = 1/($local_levD + 0.001); | |
936 $QUAL = sprintf("%.2f",$QUAL); | |
937 $INFO_1=join(";", "SVTYPE=unknown", "EVENT=unknown", "ISIZE=unknown"); | |
938 #Add Sample infomration | |
939 my $FORMAT="GT:sDEL"; | |
940 $FORMAT .= ":lSC:nSC:uRP:levD_local"; | |
941 my $SAMPLE = "0/1:"; | |
942 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD"; | |
943 $SAMPLE=~s/NA/0/g; | |
944 #### remove any white spaces. | |
945 $INFO_1=~s/\s//g; | |
946 #print join ("\t", $plus_Reads[0], $plus_Reads[1], $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
947 | |
948 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $REF2_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
949 return; | |
950 | |
951 } | |
952 #### end if there is no mate info or nRP+uRP<minRP | |
953 if (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP))){ | |
954 print "Something failed here\nif (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP)))\n"; | |
955 return} | |
956 | |
957 ################################################################################## | |
958 # Find out if mates have nearby soft-clips (to refine the breakpoints) | |
959 ################################################################################## | |
960 #Look for evidence of soft-clipping near mate | |
961 my @mate_soft_arr = (); | |
962 my $mate_start = 0; | |
963 my $mate_soft = ""; | |
964 | |
965 @mate_info_arr = split(/\t/, $mate_info); | |
966 | |
967 #### mate start and end locations. | |
968 my $filename = $right_sc; | |
969 | |
970 $start = $mate_info_arr[1] - $dist_To_Soft; | |
971 $end = $mate_info_arr[1]; | |
972 | |
973 if ($side =~ /right/) { | |
974 $start = $mate_info_arr[2]; | |
975 $end = $mate_info_arr[2] + $dist_To_Soft; | |
976 | |
977 $filename = $left_sc; | |
978 } | |
979 | |
980 #### add levenstein distance to Summary | |
981 #print "Calc distal Levd\n"; | |
982 $levD->search(rev_comp($plus_Reads[3]), $mate_info_arr[0], $start, $end, $INPUT_FASTA); | |
983 $distl_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
984 $distl_levD = "NA" if($plus_Reads[3] =~ /NA/); | |
985 #If there is no softclips to string match, then give 0 as quality value | |
986 if ($plus_Reads[3] !~ /NA/){ | |
987 $QUAL=1/($distl_levD + 0.001); | |
988 } | |
989 else { | |
990 $QUAL=0; | |
991 }; | |
992 $QUAL=sprintf("%.2f",$QUAL); | |
993 #### looking for softclips to refine break point | |
994 #### if left look in right and vice-versa. | |
995 $cmd = qq{echo -e "$mate_info_arr[0]\t$start\t$end"}; | |
996 $cmd .= qq{ | awk -F'\t' 'NF==3' | intersectBed -a stdin -b $filename | head -1}; | |
997 print "$cmd\n" if $verbose; | |
998 $mate_soft = `$cmd`; | |
999 | |
1000 $mate_soft =~ s/\n//g; | |
1001 @mate_soft_arr = split(/\s/, $mate_soft); | |
1002 my $NO_MATE_SC=""; | |
1003 if(@mate_soft_arr){ | |
1004 $mate_chr = $mate_soft_arr[0]; | |
1005 $mate_start = $mate_soft_arr[1]; | |
1006 $NO_MATE_SC="APPROXIMATE"; | |
1007 | |
1008 } else{ | |
1009 @mate_info_arr = split(/\s/,$mate_info); | |
1010 $mate_chr = $mate_info_arr[0]; | |
1011 $mate_start = $mate_info_arr[1]; | |
1012 } | |
1013 | |
1014 #end if there is no mate info | |
1015 return if ($mate_chr eq ""); | |
1016 #end if there is no mate info and !disable_RP_only | |
1017 return if (($lSC =~/NA/)&&(!$disable_RP_only)); | |
1018 | |
1019 | |
1020 ########################################################################### | |
1021 ##### Printing output | |
1022 | |
1023 ######################################### | |
1024 # Get DNA info | |
1025 ######################################### | |
1026 #print "PLUS_READS=$plus_Reads[0],$plus_Reads[1]\nMATE=$mate_chr,$mate_start,$INPUT_FASTA\n"; | |
1027 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
1028 | |
1029 ### this is alt ref. for softclip its $plus_Reads[3] | |
1030 $REF2_base = getSeq($mate_chr, $mate_start, $INPUT_FASTA); | |
1031 | |
1032 ######################################### | |
1033 # print in VCF format | |
1034 ######################################### | |
1035 | |
1036 #### abs value to account for left and right reads. | |
1037 $isize = abs($plus_Reads[1]-$mate_start); | |
1038 | |
1039 my $event_type=$type; | |
1040 $event_type=~ s/,|[0-9]//g; | |
1041 $INFO_1=join(";", "$sv_type", "EVENT=$event_type","END=$mate_start", "ISIZE=$isize","MATEID=$BND2_name"); | |
1042 $INFO_2=join(";", "$sv_type", "EVENT=$event_type","END=$plus_Reads[1]", "ISIZE=$isize","MATEID=$BND1_name"); | |
1043 | |
1044 #### remove any white spaces. | |
1045 #### ask: did you mean to remove space from ends? eg. trim() | |
1046 $INFO_1=~s/\s//g; | |
1047 $INFO_2=~s/\s//g; | |
1048 | |
1049 $FORMAT=$summary; | |
1050 $FORMAT=~ s/,|[0-9]//g; | |
1051 $FORMAT .= ":lSC:nSC:uRP:distl_levD"; | |
1052 if($NO_MATE_SC){$INFO_2 .= ":NO_MATE_SC"} | |
1053 my $SAMPLE="0/1:"; | |
1054 $SAMPLE .=$summary; | |
1055 # if($NO_MATE_SC){$SAMPLE.= ":$NO_MATE_SC"} | |
1056 | |
1057 $SAMPLE=~s/[A-Z|,|_]//g; | |
1058 my $MATE_SAMPLE=$SAMPLE; | |
1059 $SAMPLE .= ":$lSC:$nSC:$num_unmapped_pairs:$distl_levD"; | |
1060 $MATE_SAMPLE .=":NA:NA:NA:NA"; | |
1061 $SAMPLE=~s/::/:/g; | |
1062 $MATE_SAMPLE=~s/::/:/g; | |
1063 $MATE_SAMPLE=~s/NA/0/g; | |
1064 $SAMPLE=~s/NA/0/g; | |
1065 | |
1066 if($type !~ /INV/){ | |
1067 $ALT_1 = join("","]",$mate_chr,":",$mate_start,"]",$REF1_base); | |
1068 $ALT_2 = join("",$REF2_base,"[",$plus_Reads[0],":",$plus_Reads[1],"["); | |
1069 # 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND | |
1070 # 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND | |
1071 } else { | |
1072 $ALT_1 = join("", "]", $plus_Reads[0], ":", $plus_Reads[1], "]", $REF2_base); | |
1073 $ALT_2 = join("", $REF1_base, "[", $mate_chr, ":", $mate_start, "["); | |
1074 } | |
1075 | |
1076 if(($mate_chr) && ($plus_Reads[0])){ | |
1077 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $ALT_1, $QUAL,"PASS", $INFO_1, $FORMAT,$SAMPLE,"\n"); | |
1078 print OUT join ("\t", $mate_chr, $mate_start, $BND2_name, $REF2_base, $ALT_2, $QUAL, "PASS", $INFO_2, $FORMAT,$MATE_SAMPLE,"\n"); | |
1079 } | |
1080 } | |
1081 | |
1082 ############################################################################### | |
1083 ############################################################################### | |
1084 sub get_counts_n_info { | |
1085 my ($event, $side, $mapQ, $file, $dist, $target, $upL, $lwL) = @_; | |
1086 | |
1087 my $mate_info = ""; | |
1088 my $cmd = ""; | |
1089 | |
1090 if ($event =~ /^CTX$/i) { | |
1091 #print "CTX side $side\n"; | |
1092 if ($side =~ /right/i) { | |
1093 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1536 $file $target}; | |
1094 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'}; | |
1095 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1096 #if($verbose){print "$cmd\n"} | |
1097 $mate_info=`$cmd`; | |
1098 } else { | |
1099 $cmd = qq{ samtools view $new_blacklist -q $mapQ -f 16 -F 1536 $file $target}; | |
1100 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'}; | |
1101 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1102 #if($verbose){print "$cmd\n"} | |
1103 $mate_info=`$cmd`; | |
1104 } | |
1105 } elsif ($event =~ /^DEL$/i) { | |
1106 #print "DEL side $side\n"; | |
1107 if ($side =~ /right/i) { | |
1108 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
1109 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
1110 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1111 #if($verbose){print "$cmd\n"} | |
1112 $mate_info=`$cmd`; | |
1113 } else { | |
1114 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1568 -f 16 $file $target}; | |
1115 $cmd .= qq{ | awk '{OFS="\\t"} {if((\$7 ~ /=/)&&(\$9<-$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
1116 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1}; | |
1117 #if($verbose){print "$cmd\n"} | |
1118 | |
1119 $mate_info=`$cmd`; | |
1120 } | |
1121 } elsif ($event =~ /^INS$/i) { | |
1122 #print "INS side $side\n"; | |
1123 if ($side =~ /right/i) { | |
1124 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
1125 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<$lwL && \$9 > 0 )){end=\$8+1;print \$3,\$8,end}}'}; | |
1126 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1127 #if($verbose){print "$cmd\n"} | |
1128 $mate_info = `$cmd`; | |
1129 } else { | |
1130 $cmd = qq {samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target}; | |
1131 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>-$lwL && \$9 < 0 )){end=\$8+1;print \$3,\$8,end}}'}; | |
1132 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1133 #if($verbose){print "$cmd\n"} | |
1134 $mate_info = `$cmd`; | |
1135 } | |
1136 } elsif ($event =~ /^INV$/i) { | |
1137 #print "INV side $side\n"; | |
1138 if ($side =~ /right/i) { | |
1139 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1596 $file $target}; | |
1140 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'}; | |
1141 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1}; | |
1142 #if($verbose){print "$cmd\n"} | |
1143 $mate_info = `$cmd`; | |
1144 } else { | |
1145 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 48 -F 1548 $file $target}; | |
1146 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'}; | |
1147 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1148 #if($verbose){print "$cmd\n"} | |
1149 $mate_info = `$cmd`; | |
1150 } | |
1151 } elsif ($event =~ /^TDUP$/i) { | |
1152 #print "TDUP side $side\n"; | |
1153 if ($side =~ /right/i) { | |
1154 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
1155 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
1156 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4>\$8)&&(\$9<0)&& (\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
1157 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1158 #if($verbose){print "$cmd\n"} | |
1159 $mate_info = `$cmd`; | |
1160 } else { | |
1161 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target}; | |
1162 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<-$upL )){end=\$8+1;print \$3,\$8,end}}'}; | |
1163 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4<\$8)&&(\$9>0)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
1164 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1165 #if($verbose){print "$cmd\n"} | |
1166 $mate_info = `$cmd`; | |
1167 } | |
1168 } elsif ($event =~ /^NOV_INS$/i) { | |
1169 #print "NOV_INS side $side\n"; | |
1170 if ($side =~ /right/i) { | |
1171 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 8 -F 1552 $file $target}; | |
1172 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'}; | |
1173 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1174 #if($verbose){print "$cmd\n"} | |
1175 $mate_info = `$cmd`; | |
1176 } else { | |
1177 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 24 -F 1536 $file $target}; | |
1178 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'}; | |
1179 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
1180 #if($verbose){print "$cmd\n"} | |
1181 $mate_info = `$cmd`; | |
1182 } | |
1183 } | |
1184 | |
1185 $mate_info=~s/\n//g; | |
1186 my @tmp=split(/\t/, $mate_info); | |
1187 | |
1188 my $counts = 0; | |
1189 | |
1190 if (defined $tmp[3]) { | |
1191 $tmp[3] =~ s/\n//g; | |
1192 | |
1193 $counts = $tmp[3] if (length($tmp[3])); | |
1194 } | |
1195 return ({count=>$counts, info=>$mate_info}); | |
1196 } |