Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison 2.4/script/blat_parse.pl @ 16:8eb7d93f7e58 draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Sat, 31 May 2014 11:23:36 -0400 |
parents | e3609c8714fb |
children |
comparison
equal
deleted
inserted
replaced
15:da93b6f4d684 | 16:8eb7d93f7e58 |
---|---|
1 ##################################################################################################################################################### | |
2 #Purpose: To parse blat psl file | |
3 #Date: 07-30-2013 | |
4 ##################################################################################################################################################### | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 #reading input arguments | |
8 &Getopt::Long::GetOptions( | |
9 'b|BLAT_OUT=s'=> \$blat_out, | |
10 'temp:s'=>\$dirtemp, | |
11 'f|FASTA=s'=>\$infast, | |
12 ); | |
13 $blat_out =~ s/\s|\t|\r|\n//g; | |
14 $dirtemp =~ s/\s|\t|\r|\n//g; | |
15 $infast =~ s/\s|\t|\r|\n//g; | |
16 $samtools=`which samtools`; | |
17 $samtools =~ s/\s|\t|\r|\n//g; | |
18 | |
19 if($blat_out eq "" || $infast eq "" ) | |
20 { | |
21 die "Try: perl blat_parse.pl -b <PSL FILE> -f <Contigs.fa> | |
22 -temp temporary file directory | |
23 \n"; | |
24 } | |
25 if (!(-e $samtools)) | |
26 { | |
27 die "samtools must be in your path\n"; | |
28 } | |
29 | |
30 if (!(-e $infast)) | |
31 { | |
32 die "input fasta file doesn't exit\n"; | |
33 } | |
34 unless(-d $dirtemp) | |
35 { | |
36 #system("mkdir -p $dirtemp"); | |
37 $dirtemp= getcwd; | |
38 } | |
39 #opening the blat output file | |
40 open(BUFF,$blat_out) or die "no file found $blat_out\n"; | |
41 open(WRBUFF,">$dirtemp/Temp_out.txt") or die "not able to write the file \n"; | |
42 #parsing throught he file | |
43 while(<BUFF>) | |
44 { | |
45 if($_ =~ m/^\d/) | |
46 { | |
47 print WRBUFF $_; | |
48 } | |
49 else | |
50 { | |
51 print "ignoring headers $.\n"; | |
52 } | |
53 } | |
54 close(WRBUFF); | |
55 system("sort -k10,10 -k18,18n $dirtemp/Temp_out.txt > $dirtemp/Temp_out1.txt"); | |
56 system("mv $dirtemp/Temp_out1.txt $dirtemp/Temp_out.txt"); | |
57 open(BUFF,"$dirtemp/Temp_out.txt") or die "no file found Temp_out.txt\n"; | |
58 open(WRBUFF,">$dirtemp/File1_out.txt") or die "not able to write the file \n"; | |
59 close(WRBUFF); | |
60 | |
61 $prev_contig_name=""; | |
62 my @temp; | |
63 #parsing throught he file | |
64 while(<BUFF>) | |
65 { | |
66 | |
67 chomp($_); | |
68 split "\t"; | |
69 if($_[9] ne $prev_contig_name) | |
70 { | |
71 if($prev_contig_name ne "") | |
72 { | |
73 #print @temp."\n"; | |
74 #print @temp."\n"; | |
75 &processing(@temp); | |
76 } | |
77 undef(@temp); | |
78 push(@temp,$_); | |
79 } | |
80 else | |
81 { | |
82 push(@temp,$_); | |
83 } | |
84 $prev_contig_name=$_[9]; | |
85 | |
86 | |
87 } | |
88 #processing last record | |
89 &processing(@temp); | |
90 #print @temp."\n"; | |
91 close(BUFF); | |
92 | |
93 | |
94 | |
95 | |
96 ##################SUBROUTINES###################### | |
97 #actual processing of each record in the temp array(same query name objects) | |
98 | |
99 sub processing { | |
100 open(WRBUFF,">>$dirtemp/File1_out.txt") or die "not able to write the file \n"; | |
101 open(BAD_CONTIG,">>$dirtemp/bad_contig.out.txt") or die "not able to write the file \n"; | |
102 | |
103 @temp = @_; | |
104 #if number of hits for a contig is one | |
105 if(@temp == 1) | |
106 { | |
107 $i=0; | |
108 #define blocksizes array | |
109 @row=split("\t",$temp[$i]); | |
110 $row[18] =~ s/,$//g; | |
111 @blockSizes=split(',',$row[18]); | |
112 #defining var | |
113 $qSize=$row[10]; | |
114 $qStart=$row[11]; | |
115 $qStop=$row[12]; | |
116 $tstart=$row[15]; | |
117 $tstop=$row[16]; | |
118 $Strand=$row[8]; | |
119 $coverage = $row[9]; | |
120 $coverage =~ s/\w+_//g; | |
121 #calculate match val | |
122 if(($qSize-($qStop-$qStart)) ==0) | |
123 { | |
124 $flag=1; | |
125 #these ara non informative | |
126 if (@blockSizes ==1) | |
127 { | |
128 print "ignoring one of the event $row[9] $i as the event is non informative \n"; | |
129 print BAD_CONTIG "$row[9]\n"; | |
130 } | |
131 #Ignoring when number of blocks are more than two | |
132 if(@blockSizes > 2) | |
133 { | |
134 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n"; | |
135 } | |
136 #if number of blocks is equal to 2 | |
137 if(@blockSizes == 2) | |
138 { | |
139 $temp1=$tstart+$blockSizes[0]+1; | |
140 $temp2=$tstop-$blockSizes[1]-1; | |
141 | |
142 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n"; | |
143 } | |
144 $i=@temp; | |
145 } | |
146 #later part missing | |
147 elsif($qStart ==0) | |
148 { | |
149 $temp1=$tstart+$blockSizes[0]+1; | |
150 $infast_chr=$infast; | |
151 $infast_chr=~ s/\.fa//g; | |
152 $infast_chr_start=$qStop+1; | |
153 $infast_chr_stop=$qSize; | |
154 $sys="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
155 | |
156 $sys = `$sys`; | |
157 chomp($sys); | |
158 @sys=split("\n",$sys); | |
159 $INSERTION=""; | |
160 for($i=1;$i<@sys;$i++) | |
161 { | |
162 $INSERTION=$INSERTION.$sys[$i]; | |
163 } | |
164 $INSERTION_LENGTH=length($INSERTION); | |
165 $temp1=$tstart+$blockSizes[0]+1; | |
166 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\t$INSERTION\t$INSERTION_LENGTH\n"; | |
167 | |
168 } | |
169 #intial part missing | |
170 elsif($qStop == $qSize) | |
171 { | |
172 $temp1=$tstart; | |
173 $infast_chr=$infast; | |
174 $infast_chr=~ s/\.fa//g; | |
175 $infast_chr_start=0; | |
176 $infast_chr_stop=$qStart; | |
177 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
178 #die "$sys\n"; | |
179 $sys = `$sys`; | |
180 #die "$sys\n"; | |
181 chomp($sys); | |
182 @sys=split("\n",$sys); | |
183 $INSERTION=""; | |
184 for( $i=1;$i<@sys;$i++) | |
185 { | |
186 $INSERTION=$INSERTION.$sys[$i]; | |
187 } | |
188 $INSERTION_LENGTH=length($INSERTION); | |
189 $temp1=$tstart+1; | |
190 print WRBUFF "$row[9]\tUNKNOWN\tUNKNOWN\t$Strand\t$row[13]\t$temp1\t$Strand\t$coverage\n"; | |
191 | |
192 } | |
193 else | |
194 { | |
195 print "ignoring one of the event $row[9] $i as the event is non informative \n"; | |
196 } | |
197 | |
198 } | |
199 #if number of hits for a contig is greater than one | |
200 else | |
201 { | |
202 #this flag is used to see if perfect hit not found (match val =0) | |
203 $flag1 = 0; | |
204 for(my $i=0;$i<@temp;$i++) | |
205 { | |
206 | |
207 #define blocksizes array | |
208 @row=split("\t",$temp[$i]); | |
209 $row[18] =~ s/,$//g; | |
210 @blockSizes=split(',',$row[18]); | |
211 #defining var | |
212 $qSize=$row[10]; | |
213 $qStart=$row[11]; | |
214 $qStop=$row[12]; | |
215 $tstart=$row[15]; | |
216 $tstop=$row[16]; | |
217 $Strand=$row[8]; | |
218 $coverage = $row[9]; | |
219 $coverage =~ s/\w+_//g; | |
220 #calculate match val | |
221 if(($qSize-($qStop-$qStart)) ==0) | |
222 { | |
223 $flag1=1; | |
224 #these ara non informative | |
225 if (@blockSizes ==1) | |
226 { | |
227 print "ignoring one of the event $row[9] $i as the event is non informative \n"; | |
228 print BAD_CONTIG "$row[9]\n"; | |
229 } | |
230 #Ignoring when number of blocks are more than two | |
231 if(@blockSizes > 2) | |
232 { | |
233 print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n"; | |
234 } | |
235 if(@blockSizes == 2) | |
236 { | |
237 $temp1=$tstart+$blockSizes[0]+1; | |
238 $temp2=$tstop-$blockSizes[1]-1; | |
239 | |
240 print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n"; | |
241 } | |
242 $i=@temp; | |
243 } | |
244 } | |
245 #as flag value not changed proceed to see next step | |
246 if($flag1 == 0) | |
247 { | |
248 undef(@initial); | |
249 my @initial; | |
250 for(my $i=0;$i<@temp;$i++) | |
251 { | |
252 @row=split("\t",$temp[$i]); | |
253 #print "@row\n"; | |
254 unshift(@initial,[@row]); | |
255 } | |
256 #sortin the hits according to qstart & qend | |
257 @initial = sort {$a->[11] <=> $b->[11] || $b->[12] <=> $a->[12]} @initial; | |
258 #print "$row[9]\t@initial\n"; | |
259 #if($row[9] eq "NODE_5_length_149_cov_12.395973") | |
260 #{ | |
261 # for($i=0;$i<@initial;$i++) | |
262 # { | |
263 # print "@{$initial[$i]}\n"; | |
264 # } | |
265 #} | |
266 $start = ""; | |
267 $stop = ""; | |
268 $start_len=0; | |
269 $stop_len=0; | |
270 #this super flag is used to skip processing of remaining uncessary hits | |
271 $super_flag = 0; | |
272 for($i=0;$i<@initial && $super_flag == 0;$i++) | |
273 { | |
274 $flag = 0; | |
275 #print "@{$initial[$i]}\n"; | |
276 $initial[$i][18] =~ s/,$//g; | |
277 @blockSizes1=split(',',$initial[$i][18]); | |
278 #defining var | |
279 $qSize1=$initial[$i][10]; | |
280 $qStart1=$initial[$i][11]; | |
281 $qStop1=$initial[$i][12]; | |
282 $tstart1=$initial[$i][15]; | |
283 $tstop1=$initial[$i][16]; | |
284 $Strand1=$initial[$i][8]; | |
285 $Chr1 = $initial[$i][13]; | |
286 $coverage1 = $initial[$i][9]; | |
287 $coverage1 =~ s/\w+_//g; | |
288 #die "$qSize1\t$qStart1\t$qStop1\t$tstart1\t$tstop1\t$Strand1\t$Chr1\t$coverage1\n"; | |
289 #if a hit qstart = 0 then set flag =1 | |
290 if($qStart1 == 0) | |
291 { | |
292 $flag =1; | |
293 } | |
294 #if a hit qstop = 0 then set flag =2 | |
295 if($qStop1 == $qSize1) | |
296 { | |
297 $flag =2; | |
298 } | |
299 #if($row[9] eq "NODE_5_length_149_cov_12.395973") | |
300 #{ | |
301 # print "$flag \n"; | |
302 #} | |
303 if(@blockSizes1 == 1) | |
304 { | |
305 if($flag == 1 ) | |
306 { | |
307 for($j=0;$j<@initial;$j++) | |
308 { | |
309 #both hits should not be the same | |
310 if($i != $j) | |
311 { | |
312 #print "@{$initial[$i]}\n"; | |
313 $initial[$j][18] =~ s/,$//g; | |
314 @blockSizes2=split(',',$initial[$j][18]); | |
315 #defining var | |
316 $qSize2=$initial[$j][10]; | |
317 $qStart2=$initial[$j][11]; | |
318 $qStop2=$initial[$j][12]; | |
319 $tstart2=$initial[$j][15]; | |
320 $tstop2=$initial[$j][16]; | |
321 $Strand2=$initial[$j][8]; | |
322 $coverage2 = $initial[$j][9]; | |
323 $Chr2 = $initial[$j][13]; | |
324 $coverage2 =~ s/\w+_//g; | |
325 #making sure both hits are not over lapping | |
326 if($qStart2 > $qStart1) | |
327 { #allowing +-2 bases as the this hit is immediate next continous hit | |
328 if($qStop1 >= $qStart2 -2 && $qStop1 <= $qStart2 +2 ) | |
329 { | |
330 #perfect match | |
331 if($qStop2 == $qSize2) | |
332 { | |
333 if($Strand1 eq "+") | |
334 { | |
335 $tmp1 = $tstart1+$blockSizes1[0]+1; | |
336 $tmp2 = $tstart2+$blockSizes2[0]; | |
337 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; | |
338 } | |
339 else | |
340 { | |
341 $tmp1 = $tstart1+1; | |
342 $tmp2 = $tstart2+1; | |
343 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; | |
344 | |
345 } | |
346 $super_flag = 1; | |
347 $j = @initial+1; | |
348 } | |
349 #some part is missing after the second hit | |
350 else | |
351 { | |
352 $tmp1 = $tstart1+$blockSizes1[0]; | |
353 $tmp2 = $tstart2+$blockSizes2[0]; | |
354 $INSERTION=""; | |
355 $infast_chr=$infast; | |
356 $infast_chr=~ s/\.fa//g; | |
357 $infast_chr_start=$qStop1+1; | |
358 $infast_chr_stop=$qStart2-1; | |
359 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
360 #die "$sys\n"; | |
361 $sys = `$sys`; | |
362 #die "$sys\n"; | |
363 chomp($sys); | |
364 @sys=split("\n",$sys); | |
365 for( $i=1;$i<@sys;$i++) | |
366 { | |
367 $INSERTION=$INSERTION.$sys[$i]; | |
368 } | |
369 $INSERTION_LENGTH=length($INSERTION); | |
370 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; | |
371 $super_flag = 1; | |
372 $j = @initial+1; | |
373 } | |
374 | |
375 } | |
376 #if there are some insertion between two hits | |
377 elsif($qStop2 == $qSize2) | |
378 { | |
379 $tmp1 = $tstart1+$blockSizes1[0]; | |
380 $tmp2 = $tstart2+$blockSizes2[0]; | |
381 $INSERTION=""; | |
382 $infast_chr=$infast; | |
383 $infast_chr=~ s/\.fa//g; | |
384 $infast_chr_start=$qStop2+1; | |
385 $infast_chr_stop=$qSize; | |
386 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
387 #die "$sys\n"; | |
388 $sys = `$sys`; | |
389 #die "$sys\n"; | |
390 chomp($sys); | |
391 @sys=split("\n",$sys); | |
392 for( $i=1;$i<@sys;$i++) | |
393 { | |
394 $INSERTION=$INSERTION.$sys[$i]; | |
395 } | |
396 $INSERTION_LENGTH=length($INSERTION); | |
397 print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; | |
398 $super_flag = 1; | |
399 $j = @initial+1; | |
400 } | |
401 | |
402 } | |
403 | |
404 } | |
405 } | |
406 #if none worked with other reads then only process that read | |
407 if($j == @initial) | |
408 { | |
409 #die "success\n"; | |
410 $temp1=$tstart1+$blockSizes1[0]+1; | |
411 #print WRBUFF "$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\n"; | |
412 $infast_chr=$infast; | |
413 $infast_chr=~ s/\.fa//g; | |
414 $infast_chr_start=$qStop1+1; | |
415 $infast_chr_stop=$qSize1; | |
416 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
417 #die "$sys\n"; | |
418 $sys = `$sys`; | |
419 #die "$sys\n"; | |
420 chomp($sys); | |
421 @sys=split("\n",$sys); | |
422 $INSERTION=""; | |
423 for( $i=1;$i<@sys;$i++) | |
424 { | |
425 $INSERTION=$INSERTION.$sys[$i]; | |
426 } | |
427 $INSERTION_LENGTH=length($INSERTION); | |
428 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand1\t$coverage1\n"; | |
429 $super_flag = 1; | |
430 } | |
431 } | |
432 #if query end is matched to query size | |
433 elsif($flag == 2) | |
434 { | |
435 #going through other hits | |
436 for($j=0;$j<@initial;$j++) | |
437 { | |
438 #hits should not be same | |
439 if($i != $j && $qStop2) | |
440 { | |
441 #print "@{$initial[$i]}\n"; | |
442 $initial[$j][18] =~ s/,$//g; | |
443 @blockSizes2=split(',',$initial[$j][18]); | |
444 #defining var | |
445 $qSize2=$initial[$j][10]; | |
446 $qStart2=$initial[$j][11]; | |
447 $qStop2=$initial[$j][12]; | |
448 $tstart2=$initial[$j][15]; | |
449 $tstop2=$initial[$j][16]; | |
450 $Strand2=$initial[$j][8]; | |
451 $coverage2 = $initial[$j][9]; | |
452 $Chr2 = $initial[$j][13]; | |
453 $coverage2 =~ s/\w+_//g; | |
454 #if | |
455 if($qStop2 < $qStop1) | |
456 { | |
457 if($qStart1 >= $qStop2 -2 && $qStart1 <= $qStop2 +2 ) | |
458 { | |
459 #die "$qStart1 <= $qStop2 \n"; | |
460 $tmp1 = $tstart1+$blockSizes1[0]; | |
461 $tmp2 = $tstart2+$blockSizes2[0]; | |
462 $INSERTION=""; | |
463 $infast_chr=$infast; | |
464 $infast_chr=~ s/\.fa//g; | |
465 $infast_chr_start=0; | |
466 $infast_chr_stop=$qStart1-1; | |
467 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
468 #die "test $sys\n"; | |
469 $sys = `$sys`; | |
470 #die "$sys\n"; | |
471 chomp($sys); | |
472 @sys=split("\n",$sys); | |
473 for( $i=1;$i<@sys;$i++) | |
474 { | |
475 $INSERTION=$INSERTION.$sys[$i]; | |
476 } | |
477 $INSERTION_LENGTH=length($INSERTION); | |
478 print WRBUFF "$initial[$i][9]\t$Chr2\t$tmp2\t$Strand2\t$Chr1\t$tmp1\t$Strand1\t$coverage1\n"; | |
479 $super_flag = 1; | |
480 $j = @initial+1; | |
481 | |
482 } | |
483 | |
484 } | |
485 } | |
486 } | |
487 if($j == @initial) | |
488 { | |
489 $infast_chr=$infast; | |
490 $infast_chr=~ s/\.fa//g; | |
491 $infast_chr_start=0; | |
492 $infast_chr_stop=$qStart1; | |
493 $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; | |
494 #die "test $sys\n"; | |
495 $sys = `$sys`; | |
496 #die "$sys\n"; | |
497 chomp($sys); | |
498 @sys=split("\n",$sys); | |
499 $INSERTION=""; | |
500 for( $i=1;$i<@sys;$i++) | |
501 { | |
502 $INSERTION=$INSERTION.$sys[$i]; | |
503 } | |
504 $INSERTION_LENGTH=length($INSERTION); | |
505 $tmp = $tstart1+1; | |
506 print WRBUFF "$initial[$i][9]\tUNKNOWN\tUNKNOWN\t$Strand1\t$Chr1\t$tmp\t$Strand1\t$coverage1\n"; | |
507 $super_flag = 1; | |
508 } | |
509 } | |
510 } | |
511 elsif(@blockSizes == 2) | |
512 { | |
513 $temp1=$tstart1+$blockSizes[0]+1; | |
514 $temp2=$tstop1-$blockSizes[1]-1; | |
515 print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\t$Chr1\t$temp2\t$Strand1\t$coverage1\n"; | |
516 | |
517 } | |
518 } | |
519 } | |
520 | |
521 } | |
522 close(WRBUFF); | |
523 | |
524 undef(@temp); | |
525 } | |
526 |