comparison 2.4/script/blat_parse.pl @ 16:8eb7d93f7e58 draft

Uploaded
author plus91-technologies-pvt-ltd
date Sat, 31 May 2014 11:23:36 -0400
parents e3609c8714fb
children
comparison
equal deleted inserted replaced
15:da93b6f4d684 16:8eb7d93f7e58
1 #####################################################################################################################################################
2 #Purpose: To parse blat psl file
3 #Date: 07-30-2013
4 #####################################################################################################################################################
5 use Getopt::Long;
6 use Cwd;
7 #reading input arguments
8 &Getopt::Long::GetOptions(
9 'b|BLAT_OUT=s'=> \$blat_out,
10 'temp:s'=>\$dirtemp,
11 'f|FASTA=s'=>\$infast,
12 );
13 $blat_out =~ s/\s|\t|\r|\n//g;
14 $dirtemp =~ s/\s|\t|\r|\n//g;
15 $infast =~ s/\s|\t|\r|\n//g;
16 $samtools=`which samtools`;
17 $samtools =~ s/\s|\t|\r|\n//g;
18
19 if($blat_out eq "" || $infast eq "" )
20 {
21 die "Try: perl blat_parse.pl -b <PSL FILE> -f <Contigs.fa>
22 -temp temporary file directory
23 \n";
24 }
25 if (!(-e $samtools))
26 {
27 die "samtools must be in your path\n";
28 }
29
30 if (!(-e $infast))
31 {
32 die "input fasta file doesn't exit\n";
33 }
34 unless(-d $dirtemp)
35 {
36 #system("mkdir -p $dirtemp");
37 $dirtemp= getcwd;
38 }
39 #opening the blat output file
40 open(BUFF,$blat_out) or die "no file found $blat_out\n";
41 open(WRBUFF,">$dirtemp/Temp_out.txt") or die "not able to write the file \n";
42 #parsing throught he file
43 while(<BUFF>)
44 {
45 if($_ =~ m/^\d/)
46 {
47 print WRBUFF $_;
48 }
49 else
50 {
51 print "ignoring headers $.\n";
52 }
53 }
54 close(WRBUFF);
55 system("sort -k10,10 -k18,18n $dirtemp/Temp_out.txt > $dirtemp/Temp_out1.txt");
56 system("mv $dirtemp/Temp_out1.txt $dirtemp/Temp_out.txt");
57 open(BUFF,"$dirtemp/Temp_out.txt") or die "no file found Temp_out.txt\n";
58 open(WRBUFF,">$dirtemp/File1_out.txt") or die "not able to write the file \n";
59 close(WRBUFF);
60
61 $prev_contig_name="";
62 my @temp;
63 #parsing throught he file
64 while(<BUFF>)
65 {
66
67 chomp($_);
68 split "\t";
69 if($_[9] ne $prev_contig_name)
70 {
71 if($prev_contig_name ne "")
72 {
73 #print @temp."\n";
74 #print @temp."\n";
75 &processing(@temp);
76 }
77 undef(@temp);
78 push(@temp,$_);
79 }
80 else
81 {
82 push(@temp,$_);
83 }
84 $prev_contig_name=$_[9];
85
86
87 }
88 #processing last record
89 &processing(@temp);
90 #print @temp."\n";
91 close(BUFF);
92
93
94
95
96 ##################SUBROUTINES######################
97 #actual processing of each record in the temp array(same query name objects)
98
99 sub processing {
100 open(WRBUFF,">>$dirtemp/File1_out.txt") or die "not able to write the file \n";
101 open(BAD_CONTIG,">>$dirtemp/bad_contig.out.txt") or die "not able to write the file \n";
102
103 @temp = @_;
104 #if number of hits for a contig is one
105 if(@temp == 1)
106 {
107 $i=0;
108 #define blocksizes array
109 @row=split("\t",$temp[$i]);
110 $row[18] =~ s/,$//g;
111 @blockSizes=split(',',$row[18]);
112 #defining var
113 $qSize=$row[10];
114 $qStart=$row[11];
115 $qStop=$row[12];
116 $tstart=$row[15];
117 $tstop=$row[16];
118 $Strand=$row[8];
119 $coverage = $row[9];
120 $coverage =~ s/\w+_//g;
121 #calculate match val
122 if(($qSize-($qStop-$qStart)) ==0)
123 {
124 $flag=1;
125 #these ara non informative
126 if (@blockSizes ==1)
127 {
128 print "ignoring one of the event $row[9] $i as the event is non informative \n";
129 print BAD_CONTIG "$row[9]\n";
130 }
131 #Ignoring when number of blocks are more than two
132 if(@blockSizes > 2)
133 {
134 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n";
135 }
136 #if number of blocks is equal to 2
137 if(@blockSizes == 2)
138 {
139 $temp1=$tstart+$blockSizes[0]+1;
140 $temp2=$tstop-$blockSizes[1]-1;
141
142 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n";
143 }
144 $i=@temp;
145 }
146 #later part missing
147 elsif($qStart ==0)
148 {
149 $temp1=$tstart+$blockSizes[0]+1;
150 $infast_chr=$infast;
151 $infast_chr=~ s/\.fa//g;
152 $infast_chr_start=$qStop+1;
153 $infast_chr_stop=$qSize;
154 $sys="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
155
156 $sys = `$sys`;
157 chomp($sys);
158 @sys=split("\n",$sys);
159 $INSERTION="";
160 for($i=1;$i<@sys;$i++)
161 {
162 $INSERTION=$INSERTION.$sys[$i];
163 }
164 $INSERTION_LENGTH=length($INSERTION);
165 $temp1=$tstart+$blockSizes[0]+1;
166 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\t$INSERTION\t$INSERTION_LENGTH\n";
167
168 }
169 #intial part missing
170 elsif($qStop == $qSize)
171 {
172 $temp1=$tstart;
173 $infast_chr=$infast;
174 $infast_chr=~ s/\.fa//g;
175 $infast_chr_start=0;
176 $infast_chr_stop=$qStart;
177 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
178 #die "$sys\n";
179 $sys = `$sys`;
180 #die "$sys\n";
181 chomp($sys);
182 @sys=split("\n",$sys);
183 $INSERTION="";
184 for( $i=1;$i<@sys;$i++)
185 {
186 $INSERTION=$INSERTION.$sys[$i];
187 }
188 $INSERTION_LENGTH=length($INSERTION);
189 $temp1=$tstart+1;
190 print WRBUFF "$row[9]\tUNKNOWN\tUNKNOWN\t$Strand\t$row[13]\t$temp1\t$Strand\t$coverage\n";
191
192 }
193 else
194 {
195 print "ignoring one of the event $row[9] $i as the event is non informative \n";
196 }
197
198 }
199 #if number of hits for a contig is greater than one
200 else
201 {
202 #this flag is used to see if perfect hit not found (match val =0)
203 $flag1 = 0;
204 for(my $i=0;$i<@temp;$i++)
205 {
206
207 #define blocksizes array
208 @row=split("\t",$temp[$i]);
209 $row[18] =~ s/,$//g;
210 @blockSizes=split(',',$row[18]);
211 #defining var
212 $qSize=$row[10];
213 $qStart=$row[11];
214 $qStop=$row[12];
215 $tstart=$row[15];
216 $tstop=$row[16];
217 $Strand=$row[8];
218 $coverage = $row[9];
219 $coverage =~ s/\w+_//g;
220 #calculate match val
221 if(($qSize-($qStop-$qStart)) ==0)
222 {
223 $flag1=1;
224 #these ara non informative
225 if (@blockSizes ==1)
226 {
227 print "ignoring one of the event $row[9] $i as the event is non informative \n";
228 print BAD_CONTIG "$row[9]\n";
229 }
230 #Ignoring when number of blocks are more than two
231 if(@blockSizes > 2)
232 {
233 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n";
234 }
235 if(@blockSizes == 2)
236 {
237 $temp1=$tstart+$blockSizes[0]+1;
238 $temp2=$tstop-$blockSizes[1]-1;
239
240 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n";
241 }
242 $i=@temp;
243 }
244 }
245 #as flag value not changed proceed to see next step
246 if($flag1 == 0)
247 {
248 undef(@initial);
249 my @initial;
250 for(my $i=0;$i<@temp;$i++)
251 {
252 @row=split("\t",$temp[$i]);
253 #print "@row\n";
254 unshift(@initial,[@row]);
255 }
256 #sortin the hits according to qstart & qend
257 @initial = sort {$a->[11] <=> $b->[11] || $b->[12] <=> $a->[12]} @initial;
258 #print "$row[9]\t@initial\n";
259 #if($row[9] eq "NODE_5_length_149_cov_12.395973")
260 #{
261 # for($i=0;$i<@initial;$i++)
262 # {
263 # print "@{$initial[$i]}\n";
264 # }
265 #}
266 $start = "";
267 $stop = "";
268 $start_len=0;
269 $stop_len=0;
270 #this super flag is used to skip processing of remaining uncessary hits
271 $super_flag = 0;
272 for($i=0;$i<@initial && $super_flag == 0;$i++)
273 {
274 $flag = 0;
275 #print "@{$initial[$i]}\n";
276 $initial[$i][18] =~ s/,$//g;
277 @blockSizes1=split(',',$initial[$i][18]);
278 #defining var
279 $qSize1=$initial[$i][10];
280 $qStart1=$initial[$i][11];
281 $qStop1=$initial[$i][12];
282 $tstart1=$initial[$i][15];
283 $tstop1=$initial[$i][16];
284 $Strand1=$initial[$i][8];
285 $Chr1 = $initial[$i][13];
286 $coverage1 = $initial[$i][9];
287 $coverage1 =~ s/\w+_//g;
288 #die "$qSize1\t$qStart1\t$qStop1\t$tstart1\t$tstop1\t$Strand1\t$Chr1\t$coverage1\n";
289 #if a hit qstart = 0 then set flag =1
290 if($qStart1 == 0)
291 {
292 $flag =1;
293 }
294 #if a hit qstop = 0 then set flag =2
295 if($qStop1 == $qSize1)
296 {
297 $flag =2;
298 }
299 #if($row[9] eq "NODE_5_length_149_cov_12.395973")
300 #{
301 # print "$flag \n";
302 #}
303 if(@blockSizes1 == 1)
304 {
305 if($flag == 1 )
306 {
307 for($j=0;$j<@initial;$j++)
308 {
309 #both hits should not be the same
310 if($i != $j)
311 {
312 #print "@{$initial[$i]}\n";
313 $initial[$j][18] =~ s/,$//g;
314 @blockSizes2=split(',',$initial[$j][18]);
315 #defining var
316 $qSize2=$initial[$j][10];
317 $qStart2=$initial[$j][11];
318 $qStop2=$initial[$j][12];
319 $tstart2=$initial[$j][15];
320 $tstop2=$initial[$j][16];
321 $Strand2=$initial[$j][8];
322 $coverage2 = $initial[$j][9];
323 $Chr2 = $initial[$j][13];
324 $coverage2 =~ s/\w+_//g;
325 #making sure both hits are not over lapping
326 if($qStart2 > $qStart1)
327 { #allowing +-2 bases as the this hit is immediate next continous hit
328 if($qStop1 >= $qStart2 -2 && $qStop1 <= $qStart2 +2 )
329 {
330 #perfect match
331 if($qStop2 == $qSize2)
332 {
333 if($Strand1 eq "+")
334 {
335 $tmp1 = $tstart1+$blockSizes1[0]+1;
336 $tmp2 = $tstart2+$blockSizes2[0];
337 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
338 }
339 else
340 {
341 $tmp1 = $tstart1+1;
342 $tmp2 = $tstart2+1;
343 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
344
345 }
346 $super_flag = 1;
347 $j = @initial+1;
348 }
349 #some part is missing after the second hit
350 else
351 {
352 $tmp1 = $tstart1+$blockSizes1[0];
353 $tmp2 = $tstart2+$blockSizes2[0];
354 $INSERTION="";
355 $infast_chr=$infast;
356 $infast_chr=~ s/\.fa//g;
357 $infast_chr_start=$qStop1+1;
358 $infast_chr_stop=$qStart2-1;
359 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
360 #die "$sys\n";
361 $sys = `$sys`;
362 #die "$sys\n";
363 chomp($sys);
364 @sys=split("\n",$sys);
365 for( $i=1;$i<@sys;$i++)
366 {
367 $INSERTION=$INSERTION.$sys[$i];
368 }
369 $INSERTION_LENGTH=length($INSERTION);
370 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
371 $super_flag = 1;
372 $j = @initial+1;
373 }
374
375 }
376 #if there are some insertion between two hits
377 elsif($qStop2 == $qSize2)
378 {
379 $tmp1 = $tstart1+$blockSizes1[0];
380 $tmp2 = $tstart2+$blockSizes2[0];
381 $INSERTION="";
382 $infast_chr=$infast;
383 $infast_chr=~ s/\.fa//g;
384 $infast_chr_start=$qStop2+1;
385 $infast_chr_stop=$qSize;
386 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
387 #die "$sys\n";
388 $sys = `$sys`;
389 #die "$sys\n";
390 chomp($sys);
391 @sys=split("\n",$sys);
392 for( $i=1;$i<@sys;$i++)
393 {
394 $INSERTION=$INSERTION.$sys[$i];
395 }
396 $INSERTION_LENGTH=length($INSERTION);
397 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n";
398 $super_flag = 1;
399 $j = @initial+1;
400 }
401
402 }
403
404 }
405 }
406 #if none worked with other reads then only process that read
407 if($j == @initial)
408 {
409 #die "success\n";
410 $temp1=$tstart1+$blockSizes1[0]+1;
411 #print WRBUFF "$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\n";
412 $infast_chr=$infast;
413 $infast_chr=~ s/\.fa//g;
414 $infast_chr_start=$qStop1+1;
415 $infast_chr_stop=$qSize1;
416 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
417 #die "$sys\n";
418 $sys = `$sys`;
419 #die "$sys\n";
420 chomp($sys);
421 @sys=split("\n",$sys);
422 $INSERTION="";
423 for( $i=1;$i<@sys;$i++)
424 {
425 $INSERTION=$INSERTION.$sys[$i];
426 }
427 $INSERTION_LENGTH=length($INSERTION);
428 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand1\t$coverage1\n";
429 $super_flag = 1;
430 }
431 }
432 #if query end is matched to query size
433 elsif($flag == 2)
434 {
435 #going through other hits
436 for($j=0;$j<@initial;$j++)
437 {
438 #hits should not be same
439 if($i != $j && $qStop2)
440 {
441 #print "@{$initial[$i]}\n";
442 $initial[$j][18] =~ s/,$//g;
443 @blockSizes2=split(',',$initial[$j][18]);
444 #defining var
445 $qSize2=$initial[$j][10];
446 $qStart2=$initial[$j][11];
447 $qStop2=$initial[$j][12];
448 $tstart2=$initial[$j][15];
449 $tstop2=$initial[$j][16];
450 $Strand2=$initial[$j][8];
451 $coverage2 = $initial[$j][9];
452 $Chr2 = $initial[$j][13];
453 $coverage2 =~ s/\w+_//g;
454 #if
455 if($qStop2 < $qStop1)
456 {
457 if($qStart1 >= $qStop2 -2 && $qStart1 <= $qStop2 +2 )
458 {
459 #die "$qStart1 <= $qStop2 \n";
460 $tmp1 = $tstart1+$blockSizes1[0];
461 $tmp2 = $tstart2+$blockSizes2[0];
462 $INSERTION="";
463 $infast_chr=$infast;
464 $infast_chr=~ s/\.fa//g;
465 $infast_chr_start=0;
466 $infast_chr_stop=$qStart1-1;
467 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
468 #die "test $sys\n";
469 $sys = `$sys`;
470 #die "$sys\n";
471 chomp($sys);
472 @sys=split("\n",$sys);
473 for( $i=1;$i<@sys;$i++)
474 {
475 $INSERTION=$INSERTION.$sys[$i];
476 }
477 $INSERTION_LENGTH=length($INSERTION);
478 print WRBUFF "$initial[$i][9]\t$Chr2\t$tmp2\t$Strand2\t$Chr1\t$tmp1\t$Strand1\t$coverage1\n";
479 $super_flag = 1;
480 $j = @initial+1;
481
482 }
483
484 }
485 }
486 }
487 if($j == @initial)
488 {
489 $infast_chr=$infast;
490 $infast_chr=~ s/\.fa//g;
491 $infast_chr_start=0;
492 $infast_chr_stop=$qStart1;
493 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop";
494 #die "test $sys\n";
495 $sys = `$sys`;
496 #die "$sys\n";
497 chomp($sys);
498 @sys=split("\n",$sys);
499 $INSERTION="";
500 for( $i=1;$i<@sys;$i++)
501 {
502 $INSERTION=$INSERTION.$sys[$i];
503 }
504 $INSERTION_LENGTH=length($INSERTION);
505 $tmp = $tstart1+1;
506 print WRBUFF "$initial[$i][9]\tUNKNOWN\tUNKNOWN\t$Strand1\t$Chr1\t$tmp\t$Strand1\t$coverage1\n";
507 $super_flag = 1;
508 }
509 }
510 }
511 elsif(@blockSizes == 2)
512 {
513 $temp1=$tstart1+$blockSizes[0]+1;
514 $temp2=$tstop1-$blockSizes[1]-1;
515 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\t$Chr1\t$temp2\t$Strand1\t$coverage1\n";
516
517 }
518 }
519 }
520
521 }
522 close(WRBUFF);
523
524 undef(@temp);
525 }
526