10
|
1 #!/usr/bin/perl
|
|
2
|
|
3 ####
|
|
4 #### Usage: SoftSearch.pl [-lqrmsd] -b <BAM> -f <Genome.fa> -sam <samtools path> -bed <bedtools path>
|
|
5 #### Created 1-30-2012 by Steven Hart, PhD
|
|
6 #### hart.steven@mayo.edu
|
|
7 #### Required bedtools & samtools to be in path
|
|
8
|
|
9
|
|
10 use lib "/home/plus91/2.4/lib" ;
|
|
11
|
|
12 use Getopt::Long;
|
|
13 use strict;
|
|
14 use warnings;
|
|
15 #use Data::Dumper;
|
|
16 use LevD;
|
|
17 use File::Basename;
|
|
18
|
|
19 my ($INPUT_BAM,$INPUT_FASTA,$OUTPUT_FILE,$minSoft,$minSoftReads,$dist_To_Soft,$bedtools,$samtools);
|
|
20 my ($minRP, $temp_output, $num_sd, $MapQ, $chrom, $unmated_pairs, $minBQ, $pair_only, $disable_RP_only);
|
|
21 my ($levD_local_threshold, $levD_distl_threshold,$pe_upper_limit,$high_qual,$sv_only,$blacklist,$genome_file,$verbose);
|
|
22
|
|
23 my $cmd = "";
|
|
24
|
|
25 #Declare variables
|
|
26 GetOptions(
|
|
27 'b=s' => \$INPUT_BAM,
|
|
28 'f=s' => \$INPUT_FASTA,
|
|
29 'o:s' => \$OUTPUT_FILE,
|
|
30 'm:i' => \$minRP,
|
|
31 'l:i' => \$minSoft,
|
|
32 'r:i' => \$minSoftReads,
|
|
33 't:i' => \$temp_output,
|
|
34 's:s' => \$num_sd,
|
|
35 'd:i' => \$dist_To_Soft,
|
|
36 'q:i' => \$MapQ,
|
|
37 'c:s' => \$chrom,
|
|
38 'u:s' => \$unmated_pairs,
|
|
39 'x:s' => \$minBQ,
|
|
40 'p' => \$pair_only,
|
|
41 'g' => \$disable_RP_only,
|
|
42 'j:s' => \$levD_local_threshold,
|
|
43 'k:s' => \$levD_distl_threshold,
|
|
44 'a:s' => \$pe_upper_limit,
|
|
45 'e:s' => \$high_qual,
|
|
46 'L' => \$sv_only,
|
|
47 'v' => \$verbose,
|
|
48 'blacklist:s' => \$blacklist,
|
|
49 'genome_file:s' => \$genome_file,
|
|
50 "help|h|?" => \&usage);
|
|
51 unless($sv_only){$sv_only=""};
|
|
52 if(defined($INPUT_BAM)){$INPUT_BAM=$INPUT_BAM} else {print usage();die "Where is the BAM file?\n\n"}
|
|
53 if(defined($INPUT_FASTA)){$INPUT_FASTA=$INPUT_FASTA} else {print usage();die "Where is the fasta file?\n\n"}
|
|
54 my ($fn,$pathname) = fileparse($INPUT_BAM,".bam");
|
|
55 my $index=`ls $pathname/$fn*bai|head -1`;
|
|
56 #my $index =`ls \${INPUT_BAM%.bam}*bai`;
|
|
57 #print "INDEX=$index\n";
|
|
58 if(!$index){die "\n\nERROR: you need index your BAM file\n\n"}
|
|
59
|
|
60 ### get current time
|
|
61 print "Start Time : " . &spGetCurDateTime() . "\n";
|
|
62 my $now = time;
|
|
63
|
|
64 #if(defined($OUTPUT_FILE)){$OUTPUT_FILE=$OUTPUT_FILE} else {$OUTPUT_FILE="output.vcf"; print "\nNo outfile specified. Using output.vcf as default\n\n"}
|
|
65 if(defined($minSoft)){$minSoft=$minSoft} else {$minSoft=5}
|
|
66 if(defined($minRP)){$minRP=$minRP} else {$minRP=5}
|
|
67 if(defined($minSoftReads)){$minSoftReads=$minSoftReads} else {$minSoftReads=5}
|
|
68 if(defined($dist_To_Soft)){$dist_To_Soft=$dist_To_Soft} else {$dist_To_Soft=300}
|
|
69 if(defined($num_sd)){$num_sd=$num_sd} else {$num_sd=6}
|
|
70 if(defined($MapQ)){$MapQ=$MapQ} else {$MapQ=20}
|
|
71
|
|
72 unless (defined $pe_upper_limit) { $pe_upper_limit = 10000; }
|
|
73 unless (defined $levD_local_threshold) { $levD_local_threshold = 0.05; }
|
|
74 unless (defined $levD_distl_threshold) { $levD_distl_threshold = 0.05; }
|
|
75 #Get sample name if available
|
|
76 my $SAMPLE_NAME="";
|
|
77 my $OUTNAME ="";
|
|
78 $SAMPLE_NAME=`samtools view -f2 -H $INPUT_BAM|awk '{if(\$1~/^\@RG/){sub("ID:","",\$2);name=\$2;print name}}'|head -1`;
|
|
79 #print $SAMPLE_NAME;
|
|
80 $SAMPLE_NAME=~s/\n//g;
|
|
81 #print $SAMPLE_NAME;
|
|
82 my $increase=rand();
|
|
83 my $fname="test";
|
|
84 $increase++;
|
|
85 if (!$OUTPUT_FILE){
|
|
86 if($SAMPLE_NAME ne ""){$OUTNAME=$SAMPLE_NAME.".vcf"}
|
|
87 else {$OUTNAME="output.vcf"}
|
|
88 }
|
|
89 else{$OUTNAME=$OUTPUT_FILE}
|
|
90
|
|
91 print "Writing results to $OUTNAME\n";
|
|
92
|
|
93
|
|
94 ##Make sure if submitting on SGE, to prepned the "chr". Not all referecne FAST files require "chr", so we shouldn't force the issue.
|
|
95 if(!defined($chrom)){$chrom=""}
|
|
96 if(!defined($unmated_pairs)){$unmated_pairs=0}
|
|
97
|
|
98 my $badQualValue=chr($MapQ);
|
|
99 if(defined($minBQ)){ $badQualValue=chr($minBQ); }
|
|
100
|
|
101 if($badQualValue eq "#"){$badQualValue="\#"}
|
|
102
|
|
103 # adding and cheking for samtools and bedtools in the PATh
|
|
104 ## check for bedtools and samtools in the path
|
|
105 $bedtools=`which intersectBed` ;
|
|
106 if(!defined($bedtools)){die "\nError:\n\tno bedtools. Please install bedtools and add to the path\n";}
|
|
107 #$samtools=`samtools 2>&1`;
|
|
108 $samtools=`which samtools`;
|
|
109 if($samtools !~ /(samtools)/i){die "\nError:\n\tno samtools. Please install samtools and add to the path\n";}
|
|
110
|
|
111 print "Usage = SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -s $num_sd -c $chrom -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME \n\n";
|
|
112 sub usage {
|
|
113 print "\nusage: SoftSearch.pl [-cqlrmsd] -b <BAM> -f <Genome.fa> \n";
|
|
114 print "\t-q\t\tMinimum mapping quality [20]\n";
|
|
115 print "\t-l\t\tMinimum length of soft-clipped segment [5]\n";
|
|
116 print "\t-r\t\tMinimum depth of soft-clipped reads at position [5]\n";
|
|
117 print "\t-m\t\tMinimum number of discordant read pairs [5]\n";
|
|
118 print "\t-s\t\tNumber of sd away from mean to be considered discordant [6]\n";
|
|
119 print "\t-u\t\tNumber of unmated pairs [0]\n";
|
|
120 print "\t-d\t\tMax distance between soft-clipped segments and discordant read pairs [300]\n";
|
|
121 print "\t-o\t\tOutput file name [output.vcf]\n";
|
|
122 print "\t-t\t\tPrint temp files for debugging [no|yes]\n";
|
|
123 print "\t-c\t\tuse only this chrom or chr:pos1-pos2\n";
|
|
124 print "\t-p\t\tuse paired-end mode only. In other words, don't try to find soft-clipping events!\n";
|
|
125 print "\t-g\t\tEnable paired-only seach. This will look for discordant read pairs even without soft clips.\n";
|
|
126 print "\t-a\t\tset the minimum distance for a discordant read pair without soft-clipping info [10000]\n";
|
|
127 print "\t-L\t\tFlag to print out even small deletions (low quality)\n";
|
|
128 print "\t-e\t\tdisable strict quality filtering of base qualities in soft-clipped reads [no]\n";
|
|
129 print "\t-blacklist\tareas of the genome to skip calling. Requires -genome_file\n";
|
|
130 print "\t-genome_file\ttab seperated value of chromosome name and length. Only used with -blacklist option\n\n";
|
|
131
|
|
132 exit 1;
|
|
133 }
|
|
134
|
|
135
|
|
136 #############################################################
|
|
137 # create temporary variable name
|
|
138 #############################################################
|
|
139 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`);
|
|
140 our $random_name=join "", map { ("a".."z")[rand 26] } 1..8;
|
|
141
|
|
142 #############################################################
|
|
143 ## create green list
|
|
144 ##############################################################
|
|
145 #
|
|
146 my $new_blacklist="";
|
|
147 if($blacklist){
|
|
148 if(!$genome_file){die "if using a blacklist, you must also specify the location of a genome_file
|
|
149 The format of the genome_file should be
|
|
150 chrom size
|
|
151 chr1 249250621
|
|
152 chr2 243199373
|
|
153 ...
|
|
154
|
|
155 If using hg19, you can ge the genome file by
|
|
156 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from hg19.chromInfo\" > hg19.genome";}
|
|
157
|
|
158 $cmd=join("","complementBed -i $blacklist -g $genome_file >",$random_name,".bed") ;
|
|
159 system ($cmd);
|
|
160 $new_blacklist=join(""," -L ",$random_name,".bed ");
|
|
161 }
|
|
162
|
|
163 if($verbose){print "CMD=$cmd\nBlacklist is $new_blacklist\n";}
|
|
164
|
|
165
|
|
166
|
|
167
|
|
168
|
|
169 #############################################################
|
|
170 # Calcualte insert size distribution of properly mated reads
|
|
171 #############################################################
|
|
172
|
|
173 #Change for compatability with other operating systems
|
|
174 #my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)**2)}'`;
|
|
175
|
|
176 my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|head -10000|cut -f9|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)^2)}'`;
|
|
177 #my ($mean,$stdev)=split(/ /,$metrics);
|
|
178 my ($mean,$stdev)=split(/\s/,$metrics);
|
|
179 $stdev=~s/\n//;
|
|
180 my $upper_limit=int($mean+($num_sd*$stdev));
|
|
181 my $lower_limit=int($mean-($num_sd*$stdev));
|
|
182 die if (!$mean);
|
|
183 print qq{The mean insert size is $mean +/- $stdev (sd)
|
|
184 The upper limit = $upper_limit
|
|
185 The lower limit = $lower_limit\n
|
|
186 };
|
|
187 if($lower_limit<0){
|
|
188 print "Warning!! Given this insert size distribution, we can not call small indels. No other data will be affected\n";
|
|
189 $lower_limit=1;
|
|
190 }
|
|
191 my $tmp_name=join ("",$random_name,".tmp.bam");
|
|
192 my $random_file_sc = "";
|
|
193 my $command = "";
|
|
194
|
|
195 #############################################################
|
|
196 # Make sam file that has soft clipped reads
|
|
197 #############################################################
|
|
198 #give file a name
|
|
199 if(!defined($pair_only)){
|
|
200 $random_file_sc=join ("",$random_name,".sc.sam");
|
|
201 $command=join ("","samtools view -q $MapQ -F 1024 $INPUT_BAM $chrom $new_blacklist| awk '{OFS=\"\\t\"}{c=0;if(\$6~/S/){++c};if(c == 1){print}}' | perl -ane '\$TR=(\@F[10]=~tr/\#//);if(\$TR<2){print}' > ", $random_file_sc);
|
|
202
|
|
203 print "Making SAM file of soft-clipped reads\n";
|
|
204 if($verbose){ print "$command\n";}
|
|
205 system("$command");
|
|
206
|
|
207 #############################################################
|
|
208 # Find areas that have deep enough soft-clip coverage
|
|
209 print "Identifying soft-clipped regions that are at least $minSoft bp long \n";
|
|
210 open (FILE,"$random_file_sc")||die "Can't open soft-clipped sam file $random_file_sc\n";
|
|
211
|
|
212 my $tmpfile=join("",$random_file_sc,".sc.passfilter");
|
|
213 open (OUT,">$tmpfile")||die "Can't write files here!\n";
|
|
214
|
|
215 while(<FILE>){
|
|
216 @_ = split(/\t/, $_);
|
|
217 #### parse CIGAR string and create a hash of array of each operation
|
|
218 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]);
|
|
219 my $hash;
|
|
220 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR;
|
|
221
|
|
222 #for ($i=0; $i<=$#softclip_pos; $i++) {
|
|
223 foreach my $softclip (@{$hash->{S}}) {
|
|
224 #if ($CIGAR[$softclip_pos[$i]] > $minSoft){
|
|
225 if ($softclip > $minSoft){
|
|
226 ###############Make sure base qualities don't have more than 2 bad marks
|
|
227 my $qual=$_[10];
|
|
228 my $TR=($qual=~tr/$badQualValue//);
|
|
229 if($badQualValue eq "#"){ $TR=($qual=~tr/\#//); }
|
|
230 #Skip the soft clip if there is more than 2 bad qual values
|
|
231 #next if($TR > 2);
|
|
232 # if (!$high_qual){next if($TR > 2);}
|
|
233 print OUT;
|
|
234 last;
|
|
235 }
|
|
236 }
|
|
237 }
|
|
238 close FILE;
|
|
239 close OUT;
|
|
240
|
|
241 $command=join(" ","mv",$tmpfile,$random_file_sc);
|
|
242 if($verbose){ print "$command\n";}
|
|
243 system("$command");
|
|
244 }
|
|
245
|
|
246 #########################################################
|
|
247 #Stack up SoftClips
|
|
248 #########################################################
|
|
249 my $random_file=join("",$random_name,".sc.direction.bed");
|
|
250 if(!defined($pair_only)){
|
|
251 open (FILE,"$random_file_sc")|| die "Can't open sam file\n";
|
|
252 #$random_file=join("",$random_name,".sc.direction");
|
|
253
|
|
254 print "Calling sides of soft-clips\n";
|
|
255 #\nTMPOUT=$random_file\tINPUT=$random_file_sc\n\n";
|
|
256 open (TMPOUT,">$random_file")|| die "Can't create tmp file\n";
|
|
257
|
|
258 while (<FILE>){
|
|
259 @_ = split(/\t/, $_);
|
|
260 #### parse CIGAR string and create a hash of array of each operation
|
|
261 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]);
|
|
262 my $hash;
|
|
263 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR;
|
|
264
|
|
265 #### next if softclips on each end
|
|
266 next if ($_[5] =~ /^[0-9]+S.*S$/);
|
|
267
|
|
268 #### next softclip occurs in the middle
|
|
269 next if ($_[5] =~ /^[0-9]+[^S][0-9].*S.+$/);
|
|
270
|
|
271 my $softclip = $hash->{S}[0];
|
|
272
|
|
273 my $end1 = 0;
|
|
274 my $end2 = 0;
|
|
275 my $softBases = "";
|
|
276 my $right_corrected="";my $left_corrected="";
|
|
277
|
|
278 if ($softclip > $minSoft) {
|
|
279
|
|
280 ####If the soft clip occurs at end of read and its on the minus strand, then it's a right clip
|
|
281 if ($_[5] =~ /^.*S$/) {
|
|
282 $end1=$_[3]+length($_[9])-$softclip-1;
|
|
283 $end2=$end1+1;
|
|
284 next if ($end1<0);
|
|
285 #RIGHT clip on Minus
|
|
286 $softBases=substr($_[9], length($_[9])-$softclip, length($_[9]));
|
|
287 #Right clips don't always get clipped correctly, so fix that
|
|
288 # Check to see if sc base matches ref
|
|
289 $right_corrected=baseCheck($_[2],$end2,"right",$softBases);
|
|
290 print TMPOUT "$right_corrected\n"
|
|
291
|
|
292 } else {
|
|
293 #### Begins with S (left clip)
|
|
294 $end1=$_[3]-$softclip;
|
|
295 next if ($end1<0);
|
|
296
|
|
297 $softBases=substr($_[9], 0,$softclip);#print "TMP=$softBases\n";
|
|
298 $left_corrected=baseCheck($_[2],$end1,"left",$softBases);
|
|
299 if(!$left_corrected){print "baseCheck($_[2],$end1,left,$softBases)\n";next}
|
|
300 print TMPOUT "$left_corrected\n";
|
|
301 #print "\nSEQ=$_[9]\t\n";
|
|
302
|
|
303 }
|
|
304 }
|
|
305 }
|
|
306 close FILE;
|
|
307 close TMPOUT;
|
|
308 }
|
|
309 sub baseCheck{
|
|
310 my ($chrom,$pos,$direction,$softBases)=@_;
|
|
311 #skip if position is less than 0, which is caused by MT DNA
|
|
312 return if ($pos<0);
|
|
313 my $exit="";
|
|
314
|
|
315 while(!$exit){
|
|
316 if($direction=~/right/){
|
|
317 my $refBase=getSeq($chrom,$pos,$INPUT_FASTA);
|
|
318 my $softBase=substr($softBases,0,1);
|
|
319 if ($softBase !~ /$refBase/){
|
|
320 my $value=join("\t",$chrom,$pos,$pos+1,join("|",$softBases,$direction));
|
|
321 $exit="STOP";
|
|
322 return $value;
|
|
323 }
|
|
324 else{
|
|
325 $pos=$pos+1;
|
|
326 $softBases=substr($softBases, 1,length($softBases));
|
|
327 }
|
|
328 }
|
|
329 else{
|
|
330 my $refBase=getSeq($chrom,$pos+1,$INPUT_FASTA);
|
|
331 my $softBase=substr($softBases,-1,1);
|
|
332 if ($softBase !~ /$refBase/){
|
|
333 $pos=$pos-1+length($softBases);
|
|
334 my $value=join("\t",$chrom,$pos-1,$pos,join("|",$softBases,$direction));
|
|
335 $exit="STOP";
|
|
336 return $value;
|
|
337 }
|
|
338 else{
|
|
339 $pos=$pos-1;
|
|
340 $softBases=substr($softBases, 0, -1);
|
|
341 #print "Trying again $softBases\n";
|
|
342 }
|
|
343
|
|
344 }
|
|
345
|
|
346 }
|
|
347 }
|
|
348 #Remove SAM files to conserve space
|
|
349 unlink($random_file_sc);
|
|
350
|
|
351
|
|
352 my $random_file_disc="$INPUT_BAM";
|
|
353 ###
|
|
354 #
|
|
355 ######################################################
|
|
356 # Transform Read pair groups into softclip equivalents
|
|
357 ######################################################
|
|
358 #
|
|
359 #
|
|
360 #
|
|
361 my $v="";
|
|
362 #if($disable_RP_only){
|
|
363 print "Running Bam2pair.pl\n";
|
|
364 print "Looking for discordant read pairs without requiring soft-clipping information\n";
|
|
365 use FindBin qw($Bin);
|
|
366 my $path=$Bin;
|
|
367 # print"\n\nPATH=$path\n\n";
|
|
368 if($verbose){$v="-v"}
|
|
369 my $tmp_out=join("",$random_name,"RP.out");
|
|
370 $command=join("","perl ",$path,"/Bam2pair.pl -b $random_file_disc -o $tmp_out -isize $pe_upper_limit -winsize $dist_To_Soft -min $minRP -chrom $chrom -prefix $random_name -q $MapQ -blacklist $random_name.bed $v");
|
|
371 if($verbose){ print "$command\n"};
|
|
372 system("$command");
|
|
373 $command=join("","perl -ane '\$end1=\@F[1];\$end2=\@F[3];print join(\"\\t\",\@F[0..1],\$end1,\"unknown|left\");print \"\\n\";print join(\"\\t\",\@F[2..3],\$end2,\"unknown|left\");print \"\\n\"' ", $tmp_out," >> ",$random_file);
|
|
374 if($verbose){print "$command\n"};
|
|
375 system($command);
|
|
376 unlink($tmp_out);
|
|
377 #}
|
|
378 #
|
|
379
|
|
380
|
|
381 ######################################################
|
|
382 unlink("$random_file","$tmp_name","$random_file","$index","$random_name","$new_blacklist") if (-z $random_file || ! -e $random_file ) ;
|
|
383 if (-z $random_file || ! -e $random_file){
|
|
384 print "Softclipped file is empty($random_file).\nNo soft clipping found using desired paramters\n\n";
|
|
385 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
|
|
386 &print_header();
|
|
387 close OUT;
|
|
388 exit;
|
|
389 }
|
|
390
|
|
391
|
|
392 #############################################################
|
|
393 # Make sure there are enough soft-clippped supporting reads
|
|
394 #############################################################
|
|
395 my $outfile=join("",$random_file,".sc.merge.bed");
|
|
396 #sortbed -i .sc.direction | mergeBed -nms -d 25 -i stdin > .sc.merge.bed
|
|
397 $command=join(" ","sortBed -i",$random_file," | mergeBed -nms -i stdin","|egrep \";|,\"","|awk '{OFS=\"\t\"}(NF==4)'",">",$outfile);
|
|
398
|
|
399 print "$command\n" if ($verbose);
|
|
400 system("$command");
|
|
401
|
|
402 if (-z $outfile || ! -e $outfile){
|
|
403 unlink("$tmp_name","$random_file","$outfile","$index","$random_name","$new_blacklist");
|
|
404 print "mergeBed file is empty.\nNo strucutral variants found\n\n" ;
|
|
405 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
|
|
406 &print_header();
|
|
407 close OUT;
|
|
408 exit;
|
|
409 }
|
|
410
|
|
411 print "completed mergeBed\n";
|
|
412
|
|
413 ###############################################################
|
|
414 # If left and right are on the same line, make into 2 lines
|
|
415 ###############################################################
|
|
416 open (INFILE,$outfile)||die "couldn't open temp file : $. \n\n";
|
|
417 my $tmp2=join("",$random_name,".sc.fixed.merge.bed");
|
|
418 #print "INFILE=$outfile\tOUTFILE=$tmp2\n\n";
|
|
419 #INPUT FORMAT=chr9\t131467\t131473\tATGCTTATTAAAA|left;TTATTAAAAGCATA|left
|
|
420 open (OUTFILE,">$tmp2")||die "couldn't create temp file : $. \n\n";
|
|
421 while(<INFILE>){
|
|
422 chomp $_;
|
|
423 my $l = $_;
|
|
424
|
|
425 my @a = split(/\t/, $l);
|
|
426 my $info = $a[3];
|
|
427 my @info_arr = split(/\;/, $info);
|
|
428 my @left_arr=();
|
|
429 my @right_arr=();
|
|
430 @left_arr = grep(/left/, @info_arr);
|
|
431 @right_arr = grep(/right/, @info_arr);
|
|
432
|
|
433 #New
|
|
434 my $left = join(";", @left_arr);
|
|
435 my $right = join(";", @right_arr);
|
|
436 $info = join(";", @info_arr);
|
|
437
|
|
438 if((@left_arr) && (@right_arr)){
|
|
439 print OUTFILE "$a[0]\t$a[1]\t$a[2]\t$left\n$a[0]\t$a[1]\t$a[2]\t$right\n";
|
|
440 } else{
|
|
441 my $all=join("\t",@a[0..2],$info);
|
|
442 print OUTFILE "$all\n";
|
|
443 }
|
|
444 }
|
|
445
|
|
446 # make sure output file name is $outfile
|
|
447 $command=join(" ","sed -e '/ /s//\t/g'", $tmp2,"|awk 'BEGIN{OFS=\"\\t\"}(NF==4)'", "|perl -pne 's/ /\t/g'>",$outfile);
|
|
448 system("$command");
|
|
449 if($verbose){print "$command\n"};
|
|
450 unlink("$tmp_name","$random_file","$tmp2","$outfile","$index","random_name","$new_blacklist") if (-z $outfile || ! -e $outfile) ;
|
|
451 if (-z $outfile || ! -e $outfile){
|
|
452 print "Fixed mergeBed file is empty($outfile).\nNo strucutral variants found\n\n";
|
|
453 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
|
|
454 &print_header();
|
|
455 close OUT;
|
|
456 exit;
|
|
457 }
|
|
458
|
|
459 print "completed fixing mergeBed\n\n";
|
|
460
|
|
461 ###############################################################
|
|
462 # Seperate directions of soft clips
|
|
463 ###############################################################
|
|
464 my $left_sc = join("", "left", $tmp2);
|
|
465 my $right_sc = join("", "right", $tmp2);
|
|
466 use FindBin qw($Bin);
|
|
467 #my $path=$Bin;
|
|
468
|
|
469 $command=join("","grep left ", $tmp2, " |sed -e '/left /s//left\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$left_sc);
|
|
470 system("$command");
|
|
471 #print "$command\n";
|
|
472 $command=join("","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$right_sc);
|
|
473 #$command=join(" ","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g' >",$right_sc);
|
|
474 system("$command");
|
|
475 #print "$command\n";
|
|
476 #die "CHECK $right_sc\n";
|
|
477
|
|
478 ###############################################################
|
|
479 # Count the number and identify directions of soft clips
|
|
480 ###############################################################
|
|
481 print "Count the number and identify directions of soft clips\n";
|
|
482 #print "looking in $outfile\n";
|
|
483 $outfile=join("",$random_name,".sc.fixed.merge.bed");
|
|
484
|
|
485 open (INFILE,$outfile)||die "couldn't open temp file\n\n";
|
|
486 my $tmp3 = join("", $random_file, "predSV");
|
|
487 open (OUTFILE, ">$tmp3")||die "couldn't create temp file\n\n";
|
|
488 while(<INFILE>){
|
|
489 chomp;
|
|
490 @_=split(/\t/,$_);
|
|
491 my $count=tr/\;//;$count+=tr/\,//;
|
|
492 $count=$count+1;
|
|
493 my $left=0;
|
|
494 my $right=0;
|
|
495
|
|
496 while ($_ =~ /left/g) { $left++ } # count number of right clips
|
|
497 while ($_ =~ /right/g) { $right++ } # count number of left clips
|
|
498
|
|
499 ###############################################################
|
|
500 if ($count >= $minSoftReads){
|
|
501 ####get longets soft-clipped read
|
|
502 my @clips=split(/\;|,|\|/,$_[3]);
|
|
503
|
|
504 my ($max, $temp, $temp2, $temp3, $dir, $maxSclip) = (0) x 6;
|
|
505 for (my $i=0; $i<$count; $i++) {
|
|
506 my $plus1=$i+1;
|
|
507 $temp=length($clips[$i]);
|
|
508 $temp2=$clips[$plus1];
|
|
509 $temp3=$clips[$i];
|
|
510
|
|
511 if ($temp > $max){
|
|
512 $maxSclip=$temp3;
|
|
513 $max =$temp;
|
|
514 $dir=$temp2;
|
|
515 } else {
|
|
516 $max=$max;
|
|
517 $dir=$dir;
|
|
518 $maxSclip=$maxSclip;
|
|
519 }
|
|
520 $i++;
|
|
521 }
|
|
522 my $order2 = join("|", $left, $right);
|
|
523 #print join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n";
|
|
524 print OUTFILE join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n";
|
|
525 } elsif($_=~/unknown/){
|
|
526 print OUTFILE join ("\t",@_[0..2],"NA","NA","left","NA","NA|NA") . "\n";
|
|
527 print OUTFILE join ("\t",@_[0..2],"NA","NA","right","NA","NA|NA") . "\n";
|
|
528 }
|
|
529 ####Format is Chrom,start, end,longest Soft-clip,length of longest Soft-clip, direction of longest soft-clip,#supporting softclips,#right Sclips|#left Sclips
|
|
530 }
|
|
531 close INFILE;
|
|
532 close OUTFILE;
|
|
533
|
|
534 unlink("$tmp2","$tmp_name","$random_file","$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$new_blacklist") if (-z $tmp3 || !-e $tmp3) ;
|
|
535
|
|
536 if (-z $tmp3 || !-e $tmp3){
|
|
537 print "No structural variants found while Counting the number and identify directions of soft clips.\n" ;
|
|
538
|
|
539 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
|
|
540 &print_header();
|
|
541 close OUT;
|
|
542 exit;
|
|
543
|
|
544 }
|
|
545
|
|
546 print "Done counting Softclipped reads\n";
|
|
547 ###############################################################
|
|
548 #### Print header information
|
|
549 ###############################################################
|
|
550 open (OUT,">$OUTNAME")||die "Can't write files here!\n";
|
|
551 &print_header();
|
|
552 close OUT;
|
|
553
|
|
554 ###############################################################
|
|
555 ###############################################################
|
|
556 #### DO the bulk of the work
|
|
557 ###############################################################
|
|
558 use List::Util qw(min max);
|
|
559 open (FILE,"$tmp3")|| die "Can't open file\n";
|
|
560 open (OUT,">>$OUTNAME")|| die "Can't open file\n";
|
|
561
|
|
562 #print "\nusing $tmp3 and writing to $OUTPUT_FILE \n";
|
|
563 while (<FILE>){
|
|
564 #If left clip {+- or -- or -+ }{+- are uninformative b/c they go upstream}
|
|
565 #If right clip {++ or -- or +-}
|
|
566 chomp $_;
|
|
567 my $line = $_;
|
|
568 my @info = split(/\t/, $_);
|
|
569
|
|
570 if($info[5] eq "left") {
|
|
571 bulk_work("left", $line, $random_file_disc);
|
|
572
|
|
573 } elsif ($info[5] eq "right") {
|
|
574 bulk_work("right", $line, $random_file_disc);
|
|
575 }
|
|
576 #if($. ==6){print "THIS IS LINE 6\n$_\n";die}
|
|
577 print "Completed line $.\n" if ($verbose);
|
|
578 }
|
|
579 close FILE;
|
|
580 close OUT;
|
|
581
|
|
582 ###############################################################################
|
|
583 ###############################################################################
|
|
584 #### Delete temp files
|
|
585 my $meregedBed=join("",$random_name,".sc.direction.bed.sc.merge.bed");
|
|
586
|
|
587 if(defined($temp_output)){$temp_output=$temp_output} else {$temp_output="no"}
|
|
588
|
|
589 if ($temp_output eq "no"){
|
|
590 unlink("$tmp_name","$random_file","$tmp2",,"$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$meregedBed","$random_name.bed");
|
|
591 }
|
|
592 ####Sort VCF
|
|
593 my $tmp=join(".",$random_name,"tmp");
|
|
594 #Get header
|
|
595 $cmd="grep \"#\" $OUTNAME > $tmp";
|
|
596 system($cmd);
|
|
597 #sort results
|
|
598 $cmd="grep -v \"#\" $OUTNAME|perl -pne 's/chr//'|sort -k1,1n -k2,2n|perl -ne 'print \"chr\".\$_' >>$tmp";
|
|
599 system($cmd);
|
|
600 $cmd="mv $tmp $OUTNAME";
|
|
601 system($cmd);
|
|
602 #remove entries next to each other
|
|
603
|
|
604
|
|
605
|
|
606
|
|
607 #############################################################
|
|
608 ##May not need this anymore since filtering on left and right
|
|
609 #############################################################
|
|
610 #my $tmpout=$OUTNAME;
|
|
611 #$tmpout.=".tmp";
|
|
612 #use FindBin qw($Bin);
|
|
613 ##my $path=$Bin;
|
|
614 #$command="perl ".$path."/Extract_nSC.pl $OUTNAME -q nSC > $tmpout";
|
|
615 ##print "Command=$command\n";
|
|
616 #system($command);
|
|
617 #$command="perl ".$path."/reduce_redundancy.pl $tmpout $upper_limit |cut -f1-10 > $OUTNAME";
|
|
618 ##print "$command\n";
|
|
619 #system($command);
|
|
620 #system("rm $tmpout");
|
|
621 ########################################################
|
|
622
|
|
623
|
|
624
|
|
625
|
|
626 print "Analysis Completed\n\nYou did it!!!\n";
|
|
627 print "Finish Time : " . &spGetCurDateTime() . "\n";
|
|
628 $now = time - $now;
|
|
629 printf("\n\nTotal running time: %02d:%02d:%02d\n\n", int($now / 3600), int(($now % 3600) / 60),
|
|
630 int($now % 60));
|
|
631
|
|
632 exit;
|
|
633
|
|
634 ###############################################################################
|
|
635 sub rev_comp {
|
|
636 my $dna = shift;
|
|
637 my $revcomp = reverse($dna);
|
|
638 $revcomp =~ tr/ACGTacgt/TGCAtgca/;
|
|
639
|
|
640 return $revcomp;
|
|
641 }
|
|
642
|
|
643
|
|
644 ###############################################################################
|
|
645 #### to get reference base
|
|
646 sub getSeq{
|
|
647 my ($chr,$pos,$fasta)=@_;
|
|
648 #don't require chr
|
|
649 #if($chr !~ /^chr/){die "$chr is not correct\n";}
|
|
650 # die "$pos is not a number\n" if ($pos <0);
|
|
651 my @result=();
|
|
652 if ($pos <0){print "$pos is not a valid position (likely caused by circular MT chromosome)\n";return;}
|
|
653
|
|
654 @result = `samtools faidx $fasta $chr:$pos-$pos`;
|
|
655 if($result[1]){chomp($result[1]);
|
|
656 return uc($result[1]);
|
|
657 }
|
|
658 return("NA");
|
|
659 #### after return will not be printed
|
|
660 ####print "RESULTS=@result\n";
|
|
661 }
|
|
662
|
|
663 sub getBases{
|
|
664 my ($chr,$pos1,$pos2,$fasta)=@_;
|
|
665 #don't require chr
|
|
666 #if($chr !~ /^chr/){die "$chr is not correct\n";}
|
|
667 my @result=();
|
|
668 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";return;};
|
|
669
|
|
670 @result = `samtools faidx $fasta $chr:$pos1-$pos2`;
|
|
671 if(!$result[1]){$result[1]="NA"};
|
|
672 chomp($result[1]);
|
|
673 return uc($result[1]);
|
|
674
|
|
675 #### after return will not be printed
|
|
676 ####print "RESULTS=@result\n";
|
|
677 }
|
|
678 ###############################################################################
|
|
679 #### to get time
|
|
680 sub spGetCurDateTime {
|
|
681 my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
|
|
682 my $curDateTime = sprintf "%4d-%02d-%02d %02d:%02d:%02d",
|
|
683 $year+1900, $mon+1, $mday, $hour, $min, $sec;
|
|
684 return ($curDateTime);
|
|
685 }
|
|
686
|
|
687
|
|
688 ###############################################################################
|
|
689 #### print header
|
|
690 sub print_header {
|
|
691 my $date=&spGetCurDateTime();
|
|
692 my $header = qq{##fileformat=VCFv4.1
|
|
693 ##fileDate=$date
|
|
694 ##source=SoftSearch.pl
|
|
695 ##reference=$INPUT_FASTA
|
|
696 ##Usage= SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -u $unmated_pairs -s $num_sd -b $INPUT_BAM -f $INPUT_FASTA -o $OUTNAME
|
|
697 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
|
|
698 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
|
|
699 ##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
|
|
700 ##INFO=<ID=ISIZE,Number=.,Type=String,Description="Size of the SV">
|
|
701 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
|
|
702 ##FORMAT=<ID=lSC,Number=1,Type=Integer,Description="Length of the longest soft clips supporting the BND">
|
|
703 ##FORMAT=<ID=nSC,Number=1,Type=Integer,Description="Number of supporting soft-clips\">
|
|
704 ##FORMAT=<ID=uRP,Number=1,Type=Integer,Description="Number of unmated read pairs nearby Soft-Clips">
|
|
705 ##FORMAT=<ID=levD_local,Number=1,Type=Float,Description="Levenstein distance between soft-clipped bases and the area around the original soft-clipped site">
|
|
706 ##FORMAT=<ID=levD_distl,Number=1,Type=Float,Description="Levenstein distance between the soft-clipped bases and mate location">
|
|
707 ##FORMAT=<ID=CTX,Number=1,Type=Integer,Description="Number of chromosomal translocations">
|
|
708 ##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="Number of reads supporting Large Deletions">
|
|
709 ##FORMAT=<ID=INS,Number=1,Type=Integer,Description="Number of reads supporting Large insertions">
|
|
710 ##FORMAT=<ID=NOV_INS,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion">
|
|
711 ##FORMAT=<ID=TDUP,Number=1,Type=Integer,Description="Number of reads supporting a tandem duplication">
|
|
712 ##FORMAT=<ID=INV,Number=1,Type=Integer,Description="Number of reads supporting inversions">
|
|
713 ##FORMAT=<ID=sDEL,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion">
|
|
714 ##INFO=<ID=NO_MATE_SC,Number=1,Type=Flag,Description="When there is no softclipping of the mate read location, an appromiate position is used">
|
|
715 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Dummy value for maintaining VCF-Spec">
|
|
716 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$SAMPLE_NAME\n};
|
|
717
|
|
718 print OUT $header;
|
|
719 }
|
|
720
|
|
721
|
|
722 ###############################################################################
|
|
723 sub bulk_work {
|
|
724 print "#####################################@_\n" if ($verbose);
|
|
725 my ($side, $line, $file) = @_;
|
|
726 my $local_levD = 0;
|
|
727 my $distl_levD = 0;
|
|
728
|
|
729 #my @info = split(/\t/, $line);
|
|
730 my @plus_Reads = split(/\t/, $line);
|
|
731 $plus_Reads[7] =~ s/\n//g;
|
|
732
|
|
733 #### softclip length and softclip size.
|
|
734 my $lSC = $plus_Reads[4];
|
|
735 my $nSC = $plus_Reads[6];
|
|
736
|
|
737
|
|
738 #Get all types of compatible reads
|
|
739 #Get improperly paired reads (@ max distance)
|
|
740
|
|
741 #### default value for left SIDE.
|
|
742 #If left-clip, then look downstream for match of softclipped reads to define a deletion, but look for DRPs upstream
|
|
743 my $sv_type = "SVTYPE=BND";
|
|
744 my $start_local=0; my $end_local=0;my $target_local="";my $target_drp="";my $start_drp="";my $end_drp="";
|
|
745 if ($side =~ /left/) {
|
|
746 $start_local = $plus_Reads[1]-$dist_To_Soft;
|
|
747 $end_local = $plus_Reads[2];
|
|
748 $start_drp = $plus_Reads[1];
|
|
749 $end_drp = $plus_Reads[1]+$dist_To_Soft;
|
|
750
|
|
751 }
|
|
752 else{
|
|
753 $start_local = $plus_Reads[1];
|
|
754 $end_local = $plus_Reads[1]+$dist_To_Soft;
|
|
755 $start_drp = $plus_Reads[1]-$dist_To_Soft;
|
|
756 $end_drp = $plus_Reads[1];
|
|
757 }
|
|
758
|
|
759 $target_local=join("", $plus_Reads[0], ":", $start_local, "-", $end_local);
|
|
760 $target_drp=join("", $plus_Reads[0], ":", $start_drp, "-", $end_drp);
|
|
761 my $num_unmapped_pairs="";
|
|
762 if ($side =~ /right/) {
|
|
763 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f8 -F 1536 -c $INPUT_BAM $target_drp`;
|
|
764 } else {
|
|
765 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp`;
|
|
766 }
|
|
767 if($verbose){print "samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $INPUT_BAM $target_drp\n";}
|
|
768
|
|
769 $num_unmapped_pairs=~s/\n//;
|
|
770 if($verbose){print "NUM UNMAPPED PAIRS= $num_unmapped_pairs\n";}
|
|
771 my $REF1_base = "";
|
|
772 my $REF2_base = "";
|
|
773 my $INFO_1 = "";
|
|
774 my $INFO_2 = "";
|
|
775 my $ALT_1 = "";
|
|
776 my $ALT_2 = "";
|
|
777 my $isize = 0;
|
|
778 my $QUAL = "";
|
|
779 my $FORMAT = "GT:";
|
|
780
|
|
781 #### get 8 bit rand id
|
|
782 my $BND1_name = join "", map { ("a".."z")[rand 26] } 1..8;
|
|
783 my $BND2_name = join "", map { ("a".."z")[rand 26] } 1..8;
|
|
784 $BND1_name=join "_","BND",$BND1_name;
|
|
785 $BND2_name=join "_","BND",$BND2_name;
|
|
786
|
|
787 my $counts = {CTX => 0, DEL => 0, INS => 0, INV => 0, TDUP => 0, NOV_INS => 0 };
|
|
788 my $event_mate_info = {CTX => "", DEL => "", INS => "", INV => "", TDUP => "", NOV_INS => "" };
|
|
789
|
|
790 #### get mate pair info and counts per event
|
|
791 foreach my $e (sort keys %{$counts}) {
|
|
792 my $h = get_counts_n_info($e, $side, $MapQ, $file, $dist_To_Soft, $target_drp, $upper_limit, $lower_limit);
|
|
793
|
|
794 $counts->{$e} = $h->{count};
|
|
795 $event_mate_info->{$e} = $h->{info};
|
|
796 }
|
|
797 #print Dumper($counts);
|
|
798
|
|
799 my $max = 0;
|
|
800 my $type = "UNKNOWN";
|
|
801 my $nRP = 0;
|
|
802 my $mate_info = "NA\tNA\tNA\tNA";
|
|
803 my $summary = "GT:";
|
|
804
|
|
805 #### find max count of events and set type, nRP and info to corresponding
|
|
806 #### max count event.
|
|
807 #### also create a summary string of all counts to be added to VCF file.
|
|
808 foreach my $e (sort keys %{$counts}){
|
|
809 # if ($counts->{$e} >=i $max){
|
|
810 if ($counts->{$e} > $max){
|
|
811 $type = $e .",". $counts->{$e};
|
|
812 $nRP = $counts->{$e};
|
|
813
|
|
814 $max = $counts->{$e};
|
|
815
|
|
816 if (length($event_mate_info->{$e})) {
|
|
817 $mate_info = $event_mate_info->{$e};
|
|
818 }
|
|
819 }
|
|
820
|
|
821 $summary .= $e .",". $counts->{$e} .":";
|
|
822 }
|
|
823 # print "done with Summary\n";
|
|
824 #### remove last colon ":" from
|
|
825 $summary =~ s/:$//;
|
|
826 if (($minRP > $max)&&(!$disable_RP_only )){if ($verbose){print "FAILED BECAUSE ($minRP > $max)&&(!$disable_RP_only )"};return};
|
|
827
|
|
828 #### Run Levenstein distance on softclip in target region to find out if its a small deletion/insetion
|
|
829 #### passing 1: clip_seq, 2: chr, 3: start, 4: end, 5: ref file.
|
|
830 my $levD = new LevD;
|
|
831 ########################################################
|
|
832 ########################################################
|
|
833 ########################################################
|
|
834
|
|
835 #### redefine start and end location for LevD calc.
|
|
836 # $start = $plus_Reads[1]-$dist_To_Soft;
|
|
837 # $end = $plus_Reads[2];
|
|
838 my $num_bases_to_loc=0;
|
|
839 my $new_start=0;
|
|
840 my $new_end=0;
|
|
841 my $del_seq="";
|
|
842 my $start = $start_local;
|
|
843 my $end = $end_local;
|
|
844 if ($lSC=~/NA/){$lSC=0}
|
|
845
|
|
846 if ($side =~ /right/) {
|
|
847 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA);
|
|
848 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist});
|
|
849 $num_bases_to_loc=$levD->{index};
|
|
850 $new_start = $plus_Reads[2];
|
|
851 if ($plus_Reads[2]=~/^[0-9]/){$new_end=$plus_Reads[2]+$lSC};
|
|
852 }
|
|
853 else{
|
|
854 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA);
|
|
855 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist});
|
|
856 $num_bases_to_loc=$levD->{index};
|
|
857 if ($plus_Reads[2]=~/^[0-9]/){$new_start=$plus_Reads[2]-$lSC};
|
|
858 $new_end = $plus_Reads[2];
|
|
859 }
|
|
860 if((!$new_start)||(!$new_end)||($new_start<0)){print "FAILED AT ((!$new_start)||(!$new_end)||($new_start<0))\n";return};
|
|
861
|
|
862 $del_seq=getBases($plus_Reads[0], $new_start,$new_end,$INPUT_FASTA);
|
|
863 ##############################################################################
|
|
864 # #If there is a match, where is the start position of the match?
|
|
865 #
|
|
866 ##############################################################################
|
|
867
|
|
868
|
|
869 #if $plus_Reads[3] eq "NA", then it was found without soft-clipped reads
|
|
870 if($plus_Reads[3] !~ /NA/){
|
|
871 if (($local_levD < $levD_local_threshold)) {
|
|
872 return if (!$sv_only);
|
|
873 #### add value to summary to be written to vcf file.
|
|
874 $summary = "GT:sDel," . $plus_Reads[6];
|
|
875 $type = "sDEL";
|
|
876 ###########################################################################
|
|
877 ##### Printing output
|
|
878
|
|
879 #########################################
|
|
880 ##### Get DNA info
|
|
881 #########################################
|
|
882 #$REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
|
|
883 $REF1_base = substr($del_seq, 0, 1);
|
|
884
|
|
885 #### this is alt ref. for softclip its $plus_Reads[3]
|
|
886 $REF2_base = $del_seq;
|
|
887 $QUAL = 1/($local_levD + 0.001);
|
|
888 $QUAL = sprintf("%.2f",$QUAL);
|
|
889 $isize = length($del_seq);
|
|
890
|
|
891 #### svtype = none for sDEL
|
|
892 #### isize = length($info[3]);
|
|
893 #### nRP = NA
|
|
894 #### mate_id = NA
|
|
895 #### CTX,:DEL,:....sDEL,##
|
|
896 $INFO_1=join(";", "SVTYPE=NA", "EVENT=$type", "ISIZE=$isize");
|
|
897
|
|
898 #Add Sample infomration
|
|
899 my $FORMAT="GT:sDEL";
|
|
900 $FORMAT .= ":lSC:nSC:uRP:levD_local";
|
|
901 my $SAMPLE= "0/1:";
|
|
902 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD";
|
|
903
|
|
904 #### remove any white spaces.
|
|
905 $INFO_1=~s/\s//g;
|
|
906 $INFO_2=~s/\s//g;
|
|
907
|
|
908 $BND1_name =~ s/^BND/LEVD/;
|
|
909 # If left, then the start position is plus_Reads[1]-isize
|
|
910 my $start_pos=0;
|
|
911 #Make sure Ref1 and Ref2 bases are different
|
|
912 if($REF2_base eq $REF1_base){$REF1_base="NA"}
|
|
913 if($side=~/left/){$start_pos=$plus_Reads[1]-$isize}else{$start_pos=$plus_Reads[1]};
|
|
914 print OUT join ("\t", $plus_Reads[0], $start_pos, $BND1_name, $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
|
|
915 if ($verbose){print "No Softclipped reads here!\n"}
|
|
916 return;
|
|
917 }
|
|
918 }
|
|
919
|
|
920 #### Otherwise, look for DRP mate info
|
|
921 #if($nRP=~/NA/){print "MATE_INFO=$mate_info\tSide=$side\tline=$line\n";}
|
|
922 my @mate_info_arr = split(/\t/, $mate_info);
|
|
923 $nRP = $mate_info_arr[3];
|
|
924 my $mate_chr=$mate_info_arr[0];
|
|
925
|
|
926 if((! defined $nRP) || ($nRP =~ /na/i) || ($mate_chr =~ /NA/) ){
|
|
927 #PRINT UNKNOWN
|
|
928 if ($nRP =~ /na/i){print "Can't find SC reads\n" if ($verbose);return};
|
|
929 if ($verbose){print "There is an unknown\nNRP=$nRP Mate_CHR=$mate_chr minRP=$minRP\n"}
|
|
930 $summary .= ":unknown," . $plus_Reads[6];
|
|
931 $type = "unknown";
|
|
932 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
|
|
933 $REF2_base = $plus_Reads[3];
|
|
934 $BND1_name =~ s/^BND/UNKNOWN/;
|
|
935 $QUAL = 1/($local_levD + 0.001);
|
|
936 $QUAL = sprintf("%.2f",$QUAL);
|
|
937 $INFO_1=join(";", "SVTYPE=unknown", "EVENT=unknown", "ISIZE=unknown");
|
|
938 #Add Sample infomration
|
|
939 my $FORMAT="GT:sDEL";
|
|
940 $FORMAT .= ":lSC:nSC:uRP:levD_local";
|
|
941 my $SAMPLE = "0/1:";
|
|
942 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD";
|
|
943 $SAMPLE=~s/NA/0/g;
|
|
944 #### remove any white spaces.
|
|
945 $INFO_1=~s/\s//g;
|
|
946 #print join ("\t", $plus_Reads[0], $plus_Reads[1], $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
|
|
947
|
|
948 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $REF2_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n");
|
|
949 return;
|
|
950
|
|
951 }
|
|
952 #### end if there is no mate info or nRP+uRP<minRP
|
|
953 if (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP))){
|
|
954 print "Something failed here\nif (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP)))\n";
|
|
955 return}
|
|
956
|
|
957 ##################################################################################
|
|
958 # Find out if mates have nearby soft-clips (to refine the breakpoints)
|
|
959 ##################################################################################
|
|
960 #Look for evidence of soft-clipping near mate
|
|
961 my @mate_soft_arr = ();
|
|
962 my $mate_start = 0;
|
|
963 my $mate_soft = "";
|
|
964
|
|
965 @mate_info_arr = split(/\t/, $mate_info);
|
|
966
|
|
967 #### mate start and end locations.
|
|
968 my $filename = $right_sc;
|
|
969
|
|
970 $start = $mate_info_arr[1] - $dist_To_Soft;
|
|
971 $end = $mate_info_arr[1];
|
|
972
|
|
973 if ($side =~ /right/) {
|
|
974 $start = $mate_info_arr[2];
|
|
975 $end = $mate_info_arr[2] + $dist_To_Soft;
|
|
976
|
|
977 $filename = $left_sc;
|
|
978 }
|
|
979
|
|
980 #### add levenstein distance to Summary
|
|
981 #print "Calc distal Levd\n";
|
|
982 $levD->search(rev_comp($plus_Reads[3]), $mate_info_arr[0], $start, $end, $INPUT_FASTA);
|
|
983 $distl_levD = sprintf("%.2f", $levD->{relative_edit_dist});
|
|
984 $distl_levD = "NA" if($plus_Reads[3] =~ /NA/);
|
|
985 #If there is no softclips to string match, then give 0 as quality value
|
|
986 if ($plus_Reads[3] !~ /NA/){
|
|
987 $QUAL=1/($distl_levD + 0.001);
|
|
988 }
|
|
989 else {
|
|
990 $QUAL=0;
|
|
991 };
|
|
992 $QUAL=sprintf("%.2f",$QUAL);
|
|
993 #### looking for softclips to refine break point
|
|
994 #### if left look in right and vice-versa.
|
|
995 $cmd = qq{echo -e "$mate_info_arr[0]\t$start\t$end"};
|
|
996 $cmd .= qq{ | awk -F'\t' 'NF==3' | intersectBed -a stdin -b $filename | head -1};
|
|
997 print "$cmd\n" if $verbose;
|
|
998 $mate_soft = `$cmd`;
|
|
999
|
|
1000 $mate_soft =~ s/\n//g;
|
|
1001 @mate_soft_arr = split(/\s/, $mate_soft);
|
|
1002 my $NO_MATE_SC="";
|
|
1003 if(@mate_soft_arr){
|
|
1004 $mate_chr = $mate_soft_arr[0];
|
|
1005 $mate_start = $mate_soft_arr[1];
|
|
1006 $NO_MATE_SC="APPROXIMATE";
|
|
1007
|
|
1008 } else{
|
|
1009 @mate_info_arr = split(/\s/,$mate_info);
|
|
1010 $mate_chr = $mate_info_arr[0];
|
|
1011 $mate_start = $mate_info_arr[1];
|
|
1012 }
|
|
1013
|
|
1014 #end if there is no mate info
|
|
1015 return if ($mate_chr eq "");
|
|
1016 #end if there is no mate info and !disable_RP_only
|
|
1017 return if (($lSC =~/NA/)&&(!$disable_RP_only));
|
|
1018
|
|
1019
|
|
1020 ###########################################################################
|
|
1021 ##### Printing output
|
|
1022
|
|
1023 #########################################
|
|
1024 # Get DNA info
|
|
1025 #########################################
|
|
1026 #print "PLUS_READS=$plus_Reads[0],$plus_Reads[1]\nMATE=$mate_chr,$mate_start,$INPUT_FASTA\n";
|
|
1027 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA);
|
|
1028
|
|
1029 ### this is alt ref. for softclip its $plus_Reads[3]
|
|
1030 $REF2_base = getSeq($mate_chr, $mate_start, $INPUT_FASTA);
|
|
1031
|
|
1032 #########################################
|
|
1033 # print in VCF format
|
|
1034 #########################################
|
|
1035
|
|
1036 #### abs value to account for left and right reads.
|
|
1037 $isize = abs($plus_Reads[1]-$mate_start);
|
|
1038
|
|
1039 my $event_type=$type;
|
|
1040 $event_type=~ s/,|[0-9]//g;
|
|
1041 $INFO_1=join(";", "$sv_type", "EVENT=$event_type","END=$mate_start", "ISIZE=$isize","MATEID=$BND2_name");
|
|
1042 $INFO_2=join(";", "$sv_type", "EVENT=$event_type","END=$plus_Reads[1]", "ISIZE=$isize","MATEID=$BND1_name");
|
|
1043
|
|
1044 #### remove any white spaces.
|
|
1045 #### ask: did you mean to remove space from ends? eg. trim()
|
|
1046 $INFO_1=~s/\s//g;
|
|
1047 $INFO_2=~s/\s//g;
|
|
1048
|
|
1049 $FORMAT=$summary;
|
|
1050 $FORMAT=~ s/,|[0-9]//g;
|
|
1051 $FORMAT .= ":lSC:nSC:uRP:distl_levD";
|
|
1052 if($NO_MATE_SC){$INFO_2 .= ":NO_MATE_SC"}
|
|
1053 my $SAMPLE="0/1:";
|
|
1054 $SAMPLE .=$summary;
|
|
1055 # if($NO_MATE_SC){$SAMPLE.= ":$NO_MATE_SC"}
|
|
1056
|
|
1057 $SAMPLE=~s/[A-Z|,|_]//g;
|
|
1058 my $MATE_SAMPLE=$SAMPLE;
|
|
1059 $SAMPLE .= ":$lSC:$nSC:$num_unmapped_pairs:$distl_levD";
|
|
1060 $MATE_SAMPLE .=":NA:NA:NA:NA";
|
|
1061 $SAMPLE=~s/::/:/g;
|
|
1062 $MATE_SAMPLE=~s/::/:/g;
|
|
1063 $MATE_SAMPLE=~s/NA/0/g;
|
|
1064 $SAMPLE=~s/NA/0/g;
|
|
1065
|
|
1066 if($type !~ /INV/){
|
|
1067 $ALT_1 = join("","]",$mate_chr,":",$mate_start,"]",$REF1_base);
|
|
1068 $ALT_2 = join("",$REF2_base,"[",$plus_Reads[0],":",$plus_Reads[1],"[");
|
|
1069 # 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND
|
|
1070 # 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND
|
|
1071 } else {
|
|
1072 $ALT_1 = join("", "]", $plus_Reads[0], ":", $plus_Reads[1], "]", $REF2_base);
|
|
1073 $ALT_2 = join("", $REF1_base, "[", $mate_chr, ":", $mate_start, "[");
|
|
1074 }
|
|
1075
|
|
1076 if(($mate_chr) && ($plus_Reads[0])){
|
|
1077 print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $ALT_1, $QUAL,"PASS", $INFO_1, $FORMAT,$SAMPLE,"\n");
|
|
1078 print OUT join ("\t", $mate_chr, $mate_start, $BND2_name, $REF2_base, $ALT_2, $QUAL, "PASS", $INFO_2, $FORMAT,$MATE_SAMPLE,"\n");
|
|
1079 }
|
|
1080 }
|
|
1081
|
|
1082 ###############################################################################
|
|
1083 ###############################################################################
|
|
1084 sub get_counts_n_info {
|
|
1085 my ($event, $side, $mapQ, $file, $dist, $target, $upL, $lwL) = @_;
|
|
1086
|
|
1087 my $mate_info = "";
|
|
1088 my $cmd = "";
|
|
1089
|
|
1090 if ($event =~ /^CTX$/i) {
|
|
1091 #print "CTX side $side\n";
|
|
1092 if ($side =~ /right/i) {
|
|
1093 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1536 $file $target};
|
|
1094 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'};
|
|
1095 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1096 #if($verbose){print "$cmd\n"}
|
|
1097 $mate_info=`$cmd`;
|
|
1098 } else {
|
|
1099 $cmd = qq{ samtools view $new_blacklist -q $mapQ -f 16 -F 1536 $file $target};
|
|
1100 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'};
|
|
1101 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1102 #if($verbose){print "$cmd\n"}
|
|
1103 $mate_info=`$cmd`;
|
|
1104 }
|
|
1105 } elsif ($event =~ /^DEL$/i) {
|
|
1106 #print "DEL side $side\n";
|
|
1107 if ($side =~ /right/i) {
|
|
1108 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
|
|
1109 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1110 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1111 #if($verbose){print "$cmd\n"}
|
|
1112 $mate_info=`$cmd`;
|
|
1113 } else {
|
|
1114 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1568 -f 16 $file $target};
|
|
1115 $cmd .= qq{ | awk '{OFS="\\t"} {if((\$7 ~ /=/)&&(\$9<-$upL)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1116 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1};
|
|
1117 #if($verbose){print "$cmd\n"}
|
|
1118
|
|
1119 $mate_info=`$cmd`;
|
|
1120 }
|
|
1121 } elsif ($event =~ /^INS$/i) {
|
|
1122 #print "INS side $side\n";
|
|
1123 if ($side =~ /right/i) {
|
|
1124 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
|
|
1125 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<$lwL && \$9 > 0 )){end=\$8+1;print \$3,\$8,end}}'};
|
|
1126 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1127 #if($verbose){print "$cmd\n"}
|
|
1128 $mate_info = `$cmd`;
|
|
1129 } else {
|
|
1130 $cmd = qq {samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target};
|
|
1131 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>-$lwL && \$9 < 0 )){end=\$8+1;print \$3,\$8,end}}'};
|
|
1132 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1133 #if($verbose){print "$cmd\n"}
|
|
1134 $mate_info = `$cmd`;
|
|
1135 }
|
|
1136 } elsif ($event =~ /^INV$/i) {
|
|
1137 #print "INV side $side\n";
|
|
1138 if ($side =~ /right/i) {
|
|
1139 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1596 $file $target};
|
|
1140 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1141 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1};
|
|
1142 #if($verbose){print "$cmd\n"}
|
|
1143 $mate_info = `$cmd`;
|
|
1144 } else {
|
|
1145 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 48 -F 1548 $file $target};
|
|
1146 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1147 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1148 #if($verbose){print "$cmd\n"}
|
|
1149 $mate_info = `$cmd`;
|
|
1150 }
|
|
1151 } elsif ($event =~ /^TDUP$/i) {
|
|
1152 #print "TDUP side $side\n";
|
|
1153 if ($side =~ /right/i) {
|
|
1154 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target};
|
|
1155 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1156 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4>\$8)&&(\$9<0)&& (\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1157 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1158 #if($verbose){print "$cmd\n"}
|
|
1159 $mate_info = `$cmd`;
|
|
1160 } else {
|
|
1161 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target};
|
|
1162 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<-$upL )){end=\$8+1;print \$3,\$8,end}}'};
|
|
1163 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4<\$8)&&(\$9>0)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'};
|
|
1164 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1165 #if($verbose){print "$cmd\n"}
|
|
1166 $mate_info = `$cmd`;
|
|
1167 }
|
|
1168 } elsif ($event =~ /^NOV_INS$/i) {
|
|
1169 #print "NOV_INS side $side\n";
|
|
1170 if ($side =~ /right/i) {
|
|
1171 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 8 -F 1552 $file $target};
|
|
1172 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'};
|
|
1173 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1174 #if($verbose){print "$cmd\n"}
|
|
1175 $mate_info = `$cmd`;
|
|
1176 } else {
|
|
1177 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 24 -F 1536 $file $target};
|
|
1178 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'};
|
|
1179 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1};
|
|
1180 #if($verbose){print "$cmd\n"}
|
|
1181 $mate_info = `$cmd`;
|
|
1182 }
|
|
1183 }
|
|
1184
|
|
1185 $mate_info=~s/\n//g;
|
|
1186 my @tmp=split(/\t/, $mate_info);
|
|
1187
|
|
1188 my $counts = 0;
|
|
1189
|
|
1190 if (defined $tmp[3]) {
|
|
1191 $tmp[3] =~ s/\n//g;
|
|
1192
|
|
1193 $counts = $tmp[3] if (length($tmp[3]));
|
|
1194 }
|
|
1195 return ({count=>$counts, info=>$mate_info});
|
|
1196 }
|