Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
view 2.4/src/blat_parse.pl @ 13:e3609c8714fb draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Fri, 30 May 2014 03:37:55 -0400 |
parents | |
children |
line wrap: on
line source
##################################################################################################################################################### #Purpose: To parse blat psl file #Date: 07-30-2013 ##################################################################################################################################################### use Getopt::Long; use Cwd; #reading input arguments &Getopt::Long::GetOptions( 'b|BLAT_OUT=s'=> \$blat_out, 'temp:s'=>\$dirtemp, 'f|FASTA=s'=>\$infast, ); $blat_out =~ s/\s|\t|\r|\n//g; $dirtemp =~ s/\s|\t|\r|\n//g; $infast =~ s/\s|\t|\r|\n//g; $samtools=`which samtools`; $samtools =~ s/\s|\t|\r|\n//g; if($blat_out eq "" || $infast eq "" ) { die "Try: perl blat_parse.pl -b <PSL FILE> -f <Contigs.fa> -temp temporary file directory \n"; } if (!(-e $samtools)) { die "samtools must be in your path\n"; } if (!(-e $infast)) { die "input fasta file doesn't exit\n"; } unless(-d $dirtemp) { #system("mkdir -p $dirtemp"); $dirtemp= getcwd; } #opening the blat output file open(BUFF,$blat_out) or die "no file found $blat_out\n"; open(WRBUFF,">$dirtemp/Temp_out.txt") or die "not able to write the file \n"; #parsing throught he file while(<BUFF>) { if($_ =~ m/^\d/) { print WRBUFF $_; } else { print "ignoring headers $.\n"; } } close(WRBUFF); system("sort -k10,10 -k18,18n $dirtemp/Temp_out.txt > $dirtemp/Temp_out1.txt"); system("mv $dirtemp/Temp_out1.txt $dirtemp/Temp_out.txt"); open(BUFF,"$dirtemp/Temp_out.txt") or die "no file found Temp_out.txt\n"; open(WRBUFF,">$dirtemp/File1_out.txt") or die "not able to write the file \n"; close(WRBUFF); $prev_contig_name=""; my @temp; #parsing throught he file while(<BUFF>) { chomp($_); split "\t"; if($_[9] ne $prev_contig_name) { if($prev_contig_name ne "") { #print @temp."\n"; #print @temp."\n"; &processing(@temp); } undef(@temp); push(@temp,$_); } else { push(@temp,$_); } $prev_contig_name=$_[9]; } #processing last record &processing(@temp); #print @temp."\n"; close(BUFF); ##################SUBROUTINES###################### #actual processing of each record in the temp array(same query name objects) sub processing { open(WRBUFF,">>$dirtemp/File1_out.txt") or die "not able to write the file \n"; open(BAD_CONTIG,">>$dirtemp/bad_contig.out.txt") or die "not able to write the file \n"; @temp = @_; #if number of hits for a contig is one if(@temp == 1) { $i=0; #define blocksizes array @row=split("\t",$temp[$i]); $row[18] =~ s/,$//g; @blockSizes=split(',',$row[18]); #defining var $qSize=$row[10]; $qStart=$row[11]; $qStop=$row[12]; $tstart=$row[15]; $tstop=$row[16]; $Strand=$row[8]; $coverage = $row[9]; $coverage =~ s/\w+_//g; #calculate match val if(($qSize-($qStop-$qStart)) ==0) { $flag=1; #these ara non informative if (@blockSizes ==1) { print "ignoring one of the event $row[9] $i as the event is non informative \n"; print BAD_CONTIG "$row[9]\n"; } #Ignoring when number of blocks are more than two if(@blockSizes > 2) { print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n"; } #if number of blocks is equal to 2 if(@blockSizes == 2) { $temp1=$tstart+$blockSizes[0]+1; $temp2=$tstop-$blockSizes[1]-1; print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n"; } $i=@temp; } #later part missing elsif($qStart ==0) { $temp1=$tstart+$blockSizes[0]+1; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=$qStop+1; $infast_chr_stop=$qSize; $sys="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; $sys = `$sys`; chomp($sys); @sys=split("\n",$sys); $INSERTION=""; for($i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); $temp1=$tstart+$blockSizes[0]+1; print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\t$INSERTION\t$INSERTION_LENGTH\n"; } #intial part missing elsif($qStop == $qSize) { $temp1=$tstart; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=0; $infast_chr_stop=$qStart; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "$sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); $INSERTION=""; for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); $temp1=$tstart+1; print WRBUFF "$row[9]\tUNKNOWN\tUNKNOWN\t$Strand\t$row[13]\t$temp1\t$Strand\t$coverage\n"; } else { print "ignoring one of the event $row[9] $i as the event is non informative \n"; } } #if number of hits for a contig is greater than one else { #this flag is used to see if perfect hit not found (match val =0) $flag1 = 0; for(my $i=0;$i<@temp;$i++) { #define blocksizes array @row=split("\t",$temp[$i]); $row[18] =~ s/,$//g; @blockSizes=split(',',$row[18]); #defining var $qSize=$row[10]; $qStart=$row[11]; $qStop=$row[12]; $tstart=$row[15]; $tstop=$row[16]; $Strand=$row[8]; $coverage = $row[9]; $coverage =~ s/\w+_//g; #calculate match val if(($qSize-($qStop-$qStart)) ==0) { $flag1=1; #these ara non informative if (@blockSizes ==1) { print "ignoring one of the event $row[9] $i as the event is non informative \n"; print BAD_CONTIG "$row[9]\n"; } #Ignoring when number of blocks are more than two if(@blockSizes > 2) { print "ignoring event $row[9] $. AS BLOCK SIZE is greater than 2\n"; } if(@blockSizes == 2) { $temp1=$tstart+$blockSizes[0]+1; $temp2=$tstop-$blockSizes[1]-1; print WRBUFF "$row[9]\t$row[13]\t$temp1\t$Strand\t$row[13]\t$temp2\t$Strand\t$coverage\n"; } $i=@temp; } } #as flag value not changed proceed to see next step if($flag1 == 0) { undef(@initial); my @initial; for(my $i=0;$i<@temp;$i++) { @row=split("\t",$temp[$i]); #print "@row\n"; unshift(@initial,[@row]); } #sortin the hits according to qstart & qend @initial = sort {$a->[11] <=> $b->[11] || $b->[12] <=> $a->[12]} @initial; #print "$row[9]\t@initial\n"; #if($row[9] eq "NODE_5_length_149_cov_12.395973") #{ # for($i=0;$i<@initial;$i++) # { # print "@{$initial[$i]}\n"; # } #} $start = ""; $stop = ""; $start_len=0; $stop_len=0; #this super flag is used to skip processing of remaining uncessary hits $super_flag = 0; for($i=0;$i<@initial && $super_flag == 0;$i++) { $flag = 0; #print "@{$initial[$i]}\n"; $initial[$i][18] =~ s/,$//g; @blockSizes1=split(',',$initial[$i][18]); #defining var $qSize1=$initial[$i][10]; $qStart1=$initial[$i][11]; $qStop1=$initial[$i][12]; $tstart1=$initial[$i][15]; $tstop1=$initial[$i][16]; $Strand1=$initial[$i][8]; $Chr1 = $initial[$i][13]; $coverage1 = $initial[$i][9]; $coverage1 =~ s/\w+_//g; #die "$qSize1\t$qStart1\t$qStop1\t$tstart1\t$tstop1\t$Strand1\t$Chr1\t$coverage1\n"; #if a hit qstart = 0 then set flag =1 if($qStart1 == 0) { $flag =1; } #if a hit qstop = 0 then set flag =2 if($qStop1 == $qSize1) { $flag =2; } #if($row[9] eq "NODE_5_length_149_cov_12.395973") #{ # print "$flag \n"; #} if(@blockSizes1 == 1) { if($flag == 1 ) { for($j=0;$j<@initial;$j++) { #both hits should not be the same if($i != $j) { #print "@{$initial[$i]}\n"; $initial[$j][18] =~ s/,$//g; @blockSizes2=split(',',$initial[$j][18]); #defining var $qSize2=$initial[$j][10]; $qStart2=$initial[$j][11]; $qStop2=$initial[$j][12]; $tstart2=$initial[$j][15]; $tstop2=$initial[$j][16]; $Strand2=$initial[$j][8]; $coverage2 = $initial[$j][9]; $Chr2 = $initial[$j][13]; $coverage2 =~ s/\w+_//g; #making sure both hits are not over lapping if($qStart2 > $qStart1) { #allowing +-2 bases as the this hit is immediate next continous hit if($qStop1 >= $qStart2 -2 && $qStop1 <= $qStart2 +2 ) { #perfect match if($qStop2 == $qSize2) { if($Strand1 eq "+") { $tmp1 = $tstart1+$blockSizes1[0]+1; $tmp2 = $tstart2+$blockSizes2[0]; print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; } else { $tmp1 = $tstart1+1; $tmp2 = $tstart2+1; print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; } $super_flag = 1; $j = @initial+1; } #some part is missing after the second hit else { $tmp1 = $tstart1+$blockSizes1[0]; $tmp2 = $tstart2+$blockSizes2[0]; $INSERTION=""; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=$qStop1+1; $infast_chr_stop=$qStart2-1; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "$sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; $super_flag = 1; $j = @initial+1; } } #if there are some insertion between two hits elsif($qStop2 == $qSize2) { $tmp1 = $tstart1+$blockSizes1[0]; $tmp2 = $tstart2+$blockSizes2[0]; $INSERTION=""; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=$qStop2+1; $infast_chr_stop=$qSize; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "$sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); print WRBUFF "$initial[$i][9]\t$Chr1\t$tmp1\t$Strand1\t$Chr2\t$tmp2\t$Strand2\t$coverage1\n"; $super_flag = 1; $j = @initial+1; } } } } #if none worked with other reads then only process that read if($j == @initial) { #die "success\n"; $temp1=$tstart1+$blockSizes1[0]+1; #print WRBUFF "$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand\t$coverage\n"; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=$qStop1+1; $infast_chr_stop=$qSize1; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "$sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); $INSERTION=""; for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\tUNKNOWN\tUNKNOWN\t$Strand1\t$coverage1\n"; $super_flag = 1; } } #if query end is matched to query size elsif($flag == 2) { #going through other hits for($j=0;$j<@initial;$j++) { #hits should not be same if($i != $j && $qStop2) { #print "@{$initial[$i]}\n"; $initial[$j][18] =~ s/,$//g; @blockSizes2=split(',',$initial[$j][18]); #defining var $qSize2=$initial[$j][10]; $qStart2=$initial[$j][11]; $qStop2=$initial[$j][12]; $tstart2=$initial[$j][15]; $tstop2=$initial[$j][16]; $Strand2=$initial[$j][8]; $coverage2 = $initial[$j][9]; $Chr2 = $initial[$j][13]; $coverage2 =~ s/\w+_//g; #if if($qStop2 < $qStop1) { if($qStart1 >= $qStop2 -2 && $qStart1 <= $qStop2 +2 ) { #die "$qStart1 <= $qStop2 \n"; $tmp1 = $tstart1+$blockSizes1[0]; $tmp2 = $tstart2+$blockSizes2[0]; $INSERTION=""; $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=0; $infast_chr_stop=$qStart1-1; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "test $sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); print WRBUFF "$initial[$i][9]\t$Chr2\t$tmp2\t$Strand2\t$Chr1\t$tmp1\t$Strand1\t$coverage1\n"; $super_flag = 1; $j = @initial+1; } } } } if($j == @initial) { $infast_chr=$infast; $infast_chr=~ s/\.fa//g; $infast_chr_start=0; $infast_chr_stop=$qStart1; $sys ="$samtools faidx $infast $infast_chr:$infast_chr_start-$infast_chr_stop"; #die "test $sys\n"; $sys = `$sys`; #die "$sys\n"; chomp($sys); @sys=split("\n",$sys); $INSERTION=""; for( $i=1;$i<@sys;$i++) { $INSERTION=$INSERTION.$sys[$i]; } $INSERTION_LENGTH=length($INSERTION); $tmp = $tstart1+1; print WRBUFF "$initial[$i][9]\tUNKNOWN\tUNKNOWN\t$Strand1\t$Chr1\t$tmp\t$Strand1\t$coverage1\n"; $super_flag = 1; } } } elsif(@blockSizes == 2) { $temp1=$tstart1+$blockSizes[0]+1; $temp2=$tstop1-$blockSizes[1]-1; print WRBUFF "$initial[$i][9]\t$Chr1\t$temp1\t$Strand1\t$Chr1\t$temp2\t$Strand1\t$coverage1\n"; } } } } close(WRBUFF); undef(@temp); }