13
|
1 #####################################################################################################################################################
|
|
2 #Purpose: To parse blat psl file
|
|
3 #Date: 07-30-2013
|
|
4 #####################################################################################################################################################
|
|
5 use Getopt::Long;
|
|
6 use Cwd;
|
|
7 #reading input arguments
|
|
8 &Getopt::Long::GetOptions(
|
|
9 'b|BLAT_OUT=s'=> \$blat_out,
|
|
10 'temp:s'=>\$dirtemp,
|
|
11 'f|FASTA=s'=>\$infast,
|
|
12 );
|
|
13 $blat_out =~ s/\s|\t|\r|\n//g;
|
|
14 $dirtemp =~ s/\s|\t|\r|\n//g;
|
|
15 $infast =~ s/\s|\t|\r|\n//g;
|
|
16 $samtools=`which samtools`;
|
|
17 $samtools =~ s/\s|\t|\r|\n//g;
|
|
18
|
|
19 if($blat_out eq "" || $infast eq "" )
|
|
20 {
|
|
21 die "Try: perl blat_parse.pl -b <PSL FILE> -f <Contigs.fa>
|
|
22 -temp temporary file directory
|
|
23 \n";
|
|
24 }
|
|
25 if (!(-e $samtools))
|
|
26 {
|
|
27 die "samtools must be in your path\n";
|
|
28 }
|
|
29
|
|
30 if (!(-e $infast))
|
|
31 {
|
|
32 die "input fasta file doesn't exit\n";
|
|
33 }
|
|
34 unless(-d $dirtemp)
|
|
35 {
|
|
36 #system("mkdir -p $dirtemp");
|
|
37 $dirtemp= getcwd;
|
|
38 }
|
|
39 #opening the blat output file
|
|
40 open(BUFF,$blat_out) or die "no file found $blat_out\n";
|
|
41 open(WRBUFF,">$dirtemp/Temp_out.txt") or die "not able to write the file \n";
|
|
42 #parsing throught he file
|
|
43 while(<BUFF>)
|
|
44 {
|
|
45 if($_ =~ m/^\d/)
|
|
46 {
|
|
47 print WRBUFF $_;
|
|
48 }
|
|
49 else
|
|
50 {
|
|
51 print "ignoring headers $.\n";
|
|
52 }
|
|
53 }
|
|
54 close(WRBUFF);
|
|
55 system("sort -k10,10 -k18,18n $dirtemp/Temp_out.txt > $dirtemp/Temp_out1.txt");
|
|
56 system("mv $dirtemp/Temp_out1.txt $dirtemp/Temp_out.txt");
|
|
57 open(BUFF,"$dirtemp/Temp_out.txt") or die "no file found Temp_out.txt\n";
|
|
58 open(WRBUFF,">$dirtemp/File1_out.txt") or die "not able to write the file \n";
|
|
59 close(WRBUFF);
|
|
60
|
|
61 $prev_contig_name="";
|
|
62 my @temp;
|
|
63 #parsing throught he file
|
|
64 while(<BUFF>)
|
|
65 {
|
|
66
|
|
67 chomp($_);
|
|
68 split "\t";
|
|
69 if($_[9] ne $prev_contig_name)
|
|
70 {
|
|
71 if($prev_contig_name ne "")
|
|
72 {
|
|
73 #print @temp."\n";
|
|
74 #print @temp."\n";
|
|
75 &processing(@temp);
|
|
76 }
|
|
77 undef(@temp);
|
|
78 push(@temp,$_);
|
|
79 }
|
|
80 else
|
|
81 {
|
|
82 push(@temp,$_);
|
|
83 }
|
|
84 $prev_contig_name=$_[9];
|
|
85
|
|
86
|
|
87 }
|
|
88 #processing last record
|
|
89 &processing(@temp);
|
|
90 #print @temp."\n";
|
|
91 close(BUFF);
|
|
92
|
|
93
|
|
94
|
|
95
|
|
96 ##################SUBROUTINES######################
|
|
97 #actual processing of each record in the temp array(same query name objects)
|
|
98
|
|
99 sub processing {
|
|
100 open(WRBUFF,">>$dirtemp/File1_out.txt") or die "not able to write the file \n";
|
|
101 open(BAD_CONTIG,">>$dirtemp/bad_contig.out.txt") or die "not able to write the file \n";
|
|
102
|
|
103 @temp = @_;
|
|
104 #if number of hits for a contig is one
|
|
105 if(@temp == 1)
|
|
106 {
|
|
107 $i=0;
|
|
108 #define blocksizes array
|
|
109 @row=split("\t",$temp[$i]);
|
|
110 $row[18] =~ s/,$//g;
|
|
111 @blockSizes=split(',',$row[18]);
|
|
112 #defining var
|
|
113 $qSize=$row[10];
|
|
114 $qStart=$row[11];
|
|
115 $qStop=$row[12];
|
|
116 $tstart=$row[15];
|
|
117 $tstop=$row[16];
|
|
118 $Strand=$row[8];
|
|
119 $coverage = $row[9];
|
|
120 $coverage =~ s/\w+_//g;
|
|
121 #calculate match val
|
|
122 if(($qSize-($qStop-$qStart)) ==0)
|
|
123 {
|
|
124 $flag=1;
|
|
125 #these ara non informative
|
|
126 if (@blockSizes ==1)
|
|
127 {
|
|
128 print "ignoring one of the event $row[9] $i as the event is non informative \n";
|
|
129 print BAD_CONTIG "$row[9]\n";
|
|
130 }
|
|
131 #Ignoring when number of blocks are more than two
|
|
132 if(@blockSizes > 2)
|
|
133 {
|
|
134 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n";
|
|
135 }
|
|
136 #if number of blocks is equal to 2
|
|
137 if(@blockSizes == 2)
|
|
138 {
|
|
139 $temp1=$tstart+$blockSizes[0]+1;
|
|
140 $temp2=$tstop-$blockSizes[1]-1;
|
|
141
|
|
142 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n";
|
|
143 }
|
|
144 $i=@temp;
|
|
145 }
|
|
146 #later part missing
|
|
147 elsif($qStart ==0)
|
|
148 {
|
|
149 $temp1=$tstart+$blockSizes[0]+1;
|
|
150 $infast_chr=$infast;
|
|
151 $infast_chr=~ s/\.fa//g;
|
|
152 $infast_chr_start=$qStop+1;
|
|
153 $infast_chr_stop=$qSize;
|
|
154 $sys="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
155
|
|
156 $sys = `$sys`;
|
|
157 chomp($sys);
|
|
158 @sys=split("\n",$sys);
|
|
159 $INSERTION="";
|
|
160 for($i=1;$i<@sys;$i++)
|
|
161 {
|
|
162 $INSERTION=$INSERTION.$sys[$i];
|
|
163 }
|
|
164 $INSERTION_LENGTH=length($INSERTION);
|
|
165 $temp1=$tstart+$blockSizes[0]+1;
|
|
166 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\t$INSERTION\t$INSERTION_LENGTH\n";
|
|
167
|
|
168 }
|
|
169 #intial part missing
|
|
170 elsif($qStop == $qSize)
|
|
171 {
|
|
172 $temp1=$tstart;
|
|
173 $infast_chr=$infast;
|
|
174 $infast_chr=~ s/\.fa//g;
|
|
175 $infast_chr_start=0;
|
|
176 $infast_chr_stop=$qStart;
|
|
177 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
178 #die "$sys\n";
|
|
179 $sys = `$sys`;
|
|
180 #die "$sys\n";
|
|
181 chomp($sys);
|
|
182 @sys=split("\n",$sys);
|
|
183 $INSERTION="";
|
|
184 for( $i=1;$i<@sys;$i++)
|
|
185 {
|
|
186 $INSERTION=$INSERTION.$sys[$i];
|
|
187 }
|
|
188 $INSERTION_LENGTH=length($INSERTION);
|
|
189 $temp1=$tstart+1;
|
|
190 print WRBUFF "$row[9]\tUNKNOWN\tUNKNOWN\t$Strand\t$row[13]\t$temp1\t$Strand\t$coverage\n";
|
|
191
|
|
192 }
|
|
193 else
|
|
194 {
|
|
195 print "ignoring one of the event $row[9] $i as the event is non informative \n";
|
|
196 }
|
|
197
|
|
198 }
|
|
199 #if number of hits for a contig is greater than one
|
|
200 else
|
|
201 {
|
|
202 #this flag is used to see if perfect hit not found (match val =0)
|
|
203 $flag1 = 0;
|
|
204 for(my $i=0;$i<@temp;$i++)
|
|
205 {
|
|
206
|
|
207 #define blocksizes array
|
|
208 @row=split("\t",$temp[$i]);
|
|
209 $row[18] =~ s/,$//g;
|
|
210 @blockSizes=split(',',$row[18]);
|
|
211 #defining var
|
|
212 $qSize=$row[10];
|
|
213 $qStart=$row[11];
|
|
214 $qStop=$row[12];
|
|
215 $tstart=$row[15];
|
|
216 $tstop=$row[16];
|
|
217 $Strand=$row[8];
|
|
218 $coverage = $row[9];
|
|
219 $coverage =~ s/\w+_//g;
|
|
220 #calculate match val
|
|
221 if(($qSize-($qStop-$qStart)) ==0)
|
|
222 {
|
|
223 $flag1=1;
|
|
224 #these ara non informative
|
|
225 if (@blockSizes ==1)
|
|
226 {
|
|
227 print "ignoring one of the event $row[9] $i as the event is non informative \n";
|
|
228 print BAD_CONTIG "$row[9]\n";
|
|
229 }
|
|
230 #Ignoring when number of blocks are more than two
|
|
231 if(@blockSizes > 2)
|
|
232 {
|
|
233 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n";
|
|
234 }
|
|
235 if(@blockSizes == 2)
|
|
236 {
|
|
237 $temp1=$tstart+$blockSizes[0]+1;
|
|
238 $temp2=$tstop-$blockSizes[1]-1;
|
|
239
|
|
240 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n";
|
|
241 }
|
|
242 $i=@temp;
|
|
243 }
|
|
244 }
|
|
245 #as flag value not changed proceed to see next step
|
|
246 if($flag1 == 0)
|
|
247 {
|
|
248 undef(@initial);
|
|
249 my @initial;
|
|
250 for(my $i=0;$i<@temp;$i++)
|
|
251 {
|
|
252 @row=split("\t",$temp[$i]);
|
|
253 #print "@row\n";
|
|
254 unshift(@initial,[@row]);
|
|
255 }
|
|
256 #sortin the hits according to qstart & qend
|
|
257 @initial = sort {$a->[11] <=> $b->[11] || $b->[12] <=> $a->[12]} @initial;
|
|
258 #print "$row[9]\t@initial\n";
|
|
259 #if($row[9] eq "NODE_5_length_149_cov_12.395973")
|
|
260 #{
|
|
261 # for($i=0;$i<@initial;$i++)
|
|
262 # {
|
|
263 # print "@{$initial[$i]}\n";
|
|
264 # }
|
|
265 #}
|
|
266 $start = "";
|
|
267 $stop = "";
|
|
268 $start_len=0;
|
|
269 $stop_len=0;
|
|
270 #this super flag is used to skip processing of remaining uncessary hits
|
|
271 $super_flag = 0;
|
|
272 for($i=0;$i<@initial && $super_flag == 0;$i++)
|
|
273 {
|
|
274 $flag = 0;
|
|
275 #print "@{$initial[$i]}\n";
|
|
276 $initial[$i][18] =~ s/,$//g;
|
|
277 @blockSizes1=split(',',$initial[$i][18]);
|
|
278 #defining var
|
|
279 $qSize1=$initial[$i][10];
|
|
280 $qStart1=$initial[$i][11];
|
|
281 $qStop1=$initial[$i][12];
|
|
282 $tstart1=$initial[$i][15];
|
|
283 $tstop1=$initial[$i][16];
|
|
284 $Strand1=$initial[$i][8];
|
|
285 $Chr1 = $initial[$i][13];
|
|
286 $coverage1 = $initial[$i][9];
|
|
287 $coverage1 =~ s/\w+_//g;
|
|
288 #die "$qSize1\t$qStart1\t$qStop1\t$tstart1\t$tstop1\t$Strand1\t$Chr1\t$coverage1\n";
|
|
289 #if a hit qstart = 0 then set flag =1
|
|
290 if($qStart1 == 0)
|
|
291 {
|
|
292 $flag =1;
|
|
293 }
|
|
294 #if a hit qstop = 0 then set flag =2
|
|
295 if($qStop1 == $qSize1)
|
|
296 {
|
|
297 $flag =2;
|
|
298 }
|
|
299 #if($row[9] eq "NODE_5_length_149_cov_12.395973")
|
|
300 #{
|
|
301 # print "$flag \n";
|
|
302 #}
|
|
303 if(@blockSizes1 == 1)
|
|
304 {
|
|
305 if($flag == 1 )
|
|
306 {
|
|
307 for($j=0;$j<@initial;$j++)
|
|
308 {
|
|
309 #both hits should not be the same
|
|
310 if($i != $j)
|
|
311 {
|
|
312 #print "@{$initial[$i]}\n";
|
|
313 $initial[$j][18] =~ s/,$//g;
|
|
314 @blockSizes2=split(',',$initial[$j][18]);
|
|
315 #defining var
|
|
316 $qSize2=$initial[$j][10];
|
|
317 $qStart2=$initial[$j][11];
|
|
318 $qStop2=$initial[$j][12];
|
|
319 $tstart2=$initial[$j][15];
|
|
320 $tstop2=$initial[$j][16];
|
|
321 $Strand2=$initial[$j][8];
|
|
322 $coverage2 = $initial[$j][9];
|
|
323 $Chr2 = $initial[$j][13];
|
|
324 $coverage2 =~ s/\w+_//g;
|
|
325 #making sure both hits are not over lapping
|
|
326 if($qStart2 > $qStart1)
|
|
327 { #allowing +-2 bases as the this hit is immediate next continous hit
|
|
328 if($qStop1 >= $qStart2 -2 && $qStop1 <= $qStart2 +2 )
|
|
329 {
|
|
330 #perfect match
|
|
331 if($qStop2 == $qSize2)
|
|
332 {
|
|
333 if($Strand1 eq "+")
|
|
334 {
|
|
335 $tmp1 = $tstart1+$blockSizes1[0]+1;
|
|
336 $tmp2 = $tstart2+$blockSizes2[0];
|
|
337 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
|
|
338 }
|
|
339 else
|
|
340 {
|
|
341 $tmp1 = $tstart1+1;
|
|
342 $tmp2 = $tstart2+1;
|
|
343 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
|
|
344
|
|
345 }
|
|
346 $super_flag = 1;
|
|
347 $j = @initial+1;
|
|
348 }
|
|
349 #some part is missing after the second hit
|
|
350 else
|
|
351 {
|
|
352 $tmp1 = $tstart1+$blockSizes1[0];
|
|
353 $tmp2 = $tstart2+$blockSizes2[0];
|
|
354 $INSERTION="";
|
|
355 $infast_chr=$infast;
|
|
356 $infast_chr=~ s/\.fa//g;
|
|
357 $infast_chr_start=$qStop1+1;
|
|
358 $infast_chr_stop=$qStart2-1;
|
|
359 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
360 #die "$sys\n";
|
|
361 $sys = `$sys`;
|
|
362 #die "$sys\n";
|
|
363 chomp($sys);
|
|
364 @sys=split("\n",$sys);
|
|
365 for( $i=1;$i<@sys;$i++)
|
|
366 {
|
|
367 $INSERTION=$INSERTION.$sys[$i];
|
|
368 }
|
|
369 $INSERTION_LENGTH=length($INSERTION);
|
|
370 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
|
|
371 $super_flag = 1;
|
|
372 $j = @initial+1;
|
|
373 }
|
|
374
|
|
375 }
|
|
376 #if there are some insertion between two hits
|
|
377 elsif($qStop2 == $qSize2)
|
|
378 {
|
|
379 $tmp1 = $tstart1+$blockSizes1[0];
|
|
380 $tmp2 = $tstart2+$blockSizes2[0];
|
|
381 $INSERTION="";
|
|
382 $infast_chr=$infast;
|
|
383 $infast_chr=~ s/\.fa//g;
|
|
384 $infast_chr_start=$qStop2+1;
|
|
385 $infast_chr_stop=$qSize;
|
|
386 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
387 #die "$sys\n";
|
|
388 $sys = `$sys`;
|
|
389 #die "$sys\n";
|
|
390 chomp($sys);
|
|
391 @sys=split("\n",$sys);
|
|
392 for( $i=1;$i<@sys;$i++)
|
|
393 {
|
|
394 $INSERTION=$INSERTION.$sys[$i];
|
|
395 }
|
|
396 $INSERTION_LENGTH=length($INSERTION);
|
|
397 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
|
|
398 $super_flag = 1;
|
|
399 $j = @initial+1;
|
|
400 }
|
|
401
|
|
402 }
|
|
403
|
|
404 }
|
|
405 }
|
|
406 #if none worked with other reads then only process that read
|
|
407 if($j == @initial)
|
|
408 {
|
|
409 #die "success\n";
|
|
410 $temp1=$tstart1+$blockSizes1[0]+1;
|
|
411 #print WRBUFF "$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\n";
|
|
412 $infast_chr=$infast;
|
|
413 $infast_chr=~ s/\.fa//g;
|
|
414 $infast_chr_start=$qStop1+1;
|
|
415 $infast_chr_stop=$qSize1;
|
|
416 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
417 #die "$sys\n";
|
|
418 $sys = `$sys`;
|
|
419 #die "$sys\n";
|
|
420 chomp($sys);
|
|
421 @sys=split("\n",$sys);
|
|
422 $INSERTION="";
|
|
423 for( $i=1;$i<@sys;$i++)
|
|
424 {
|
|
425 $INSERTION=$INSERTION.$sys[$i];
|
|
426 }
|
|
427 $INSERTION_LENGTH=length($INSERTION);
|
|
428 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand1\t$coverage1\n";
|
|
429 $super_flag = 1;
|
|
430 }
|
|
431 }
|
|
432 #if query end is matched to query size
|
|
433 elsif($flag == 2)
|
|
434 {
|
|
435 #going through other hits
|
|
436 for($j=0;$j<@initial;$j++)
|
|
437 {
|
|
438 #hits should not be same
|
|
439 if($i != $j && $qStop2)
|
|
440 {
|
|
441 #print "@{$initial[$i]}\n";
|
|
442 $initial[$j][18] =~ s/,$//g;
|
|
443 @blockSizes2=split(',',$initial[$j][18]);
|
|
444 #defining var
|
|
445 $qSize2=$initial[$j][10];
|
|
446 $qStart2=$initial[$j][11];
|
|
447 $qStop2=$initial[$j][12];
|
|
448 $tstart2=$initial[$j][15];
|
|
449 $tstop2=$initial[$j][16];
|
|
450 $Strand2=$initial[$j][8];
|
|
451 $coverage2 = $initial[$j][9];
|
|
452 $Chr2 = $initial[$j][13];
|
|
453 $coverage2 =~ s/\w+_//g;
|
|
454 #if
|
|
455 if($qStop2 < $qStop1)
|
|
456 {
|
|
457 if($qStart1 >= $qStop2 -2 && $qStart1 <= $qStop2 +2 )
|
|
458 {
|
|
459 #die "$qStart1 <= $qStop2 \n";
|
|
460 $tmp1 = $tstart1+$blockSizes1[0];
|
|
461 $tmp2 = $tstart2+$blockSizes2[0];
|
|
462 $INSERTION="";
|
|
463 $infast_chr=$infast;
|
|
464 $infast_chr=~ s/\.fa//g;
|
|
465 $infast_chr_start=0;
|
|
466 $infast_chr_stop=$qStart1-1;
|
|
467 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
468 #die "test $sys\n";
|
|
469 $sys = `$sys`;
|
|
470 #die "$sys\n";
|
|
471 chomp($sys);
|
|
472 @sys=split("\n",$sys);
|
|
473 for( $i=1;$i<@sys;$i++)
|
|
474 {
|
|
475 $INSERTION=$INSERTION.$sys[$i];
|
|
476 }
|
|
477 $INSERTION_LENGTH=length($INSERTION);
|
|
478 print WRBUFF "$initial[$i][9]\t$Chr2\t$tmp2\t$Strand2\t$Chr1\t$tmp1\t$Strand1\t$coverage1\n";
|
|
479 $super_flag = 1;
|
|
480 $j = @initial+1;
|
|
481
|
|
482 }
|
|
483
|
|
484 }
|
|
485 }
|
|
486 }
|
|
487 if($j == @initial)
|
|
488 {
|
|
489 $infast_chr=$infast;
|
|
490 $infast_chr=~ s/\.fa//g;
|
|
491 $infast_chr_start=0;
|
|
492 $infast_chr_stop=$qStart1;
|
|
493 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
|
|
494 #die "test $sys\n";
|
|
495 $sys = `$sys`;
|
|
496 #die "$sys\n";
|
|
497 chomp($sys);
|
|
498 @sys=split("\n",$sys);
|
|
499 $INSERTION="";
|
|
500 for( $i=1;$i<@sys;$i++)
|
|
501 {
|
|
502 $INSERTION=$INSERTION.$sys[$i];
|
|
503 }
|
|
504 $INSERTION_LENGTH=length($INSERTION);
|
|
505 $tmp = $tstart1+1;
|
|
506 print WRBUFF "$initial[$i][9]\tUNKNOWN\tUNKNOWN\t$Strand1\t$Chr1\t$tmp\t$Strand1\t$coverage1\n";
|
|
507 $super_flag = 1;
|
|
508 }
|
|
509 }
|
|
510 }
|
|
511 elsif(@blockSizes == 2)
|
|
512 {
|
|
513 $temp1=$tstart1+$blockSizes[0]+1;
|
|
514 $temp2=$tstop1-$blockSizes[1]-1;
|
|
515 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\t$Chr1\t$temp2\t$Strand1\t$coverage1\n";
|
|
516
|
|
517 }
|
|
518 }
|
|
519 }
|
|
520
|
|
521 }
|
|
522 close(WRBUFF);
|
|
523
|
|
524 undef(@temp);
|
|
525 }
|
|
526
|