view rapsodyn/ParseBlastForUniqueMatch.pl @ 4:9074a5104cdd draft

Uploaded
author mcharles
date Wed, 17 Sep 2014 04:20:08 -0400
parents 442a7c88b886
children 3f7b0788a1c4
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;


my $input_variant_file = $ARGV[0];
my $input_blast_file = $ARGV[1];
my $window_length = $ARGV[2];
my $nb_mismatch_max = $ARGV[3];

my %hash_name;

open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
while (my $line =<INB>){
	my @fields = split (/\s+/,$line);
	# print $#fields,"\n";
	# print $line;
	# exit(0);
	my $query_name;
	my $query_start;
	my $query_stop;
	my $query_aln;
	my $subject_aln;
	my $compt_mismatch_5p=0;
	my $compt_mismatch_3p=0;
	
	if ($#fields == 24){
		$query_name = $fields[0];
		$query_start = $fields[6];
		$query_stop = $fields[7];
		$query_aln = $fields[20];
		$subject_aln = $fields[21];
	}
	elsif ($#fields == 4){
		$query_name = $fields[0];
		$query_start = $fields[1];
		$query_stop = $fields[2];
		$query_aln = $fields[3];
		$subject_aln = $fields[4];
	}
	else {
		print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
		exit(0);
	}
	
	
	my @field_name = split (/\_/,$query_name);
	my $name = $field_name[0]."_".$field_name[1];
	if ($query_name =~ /random/){
		$name .= "_".$field_name[2];
		# print $name."\n";
	
	}
	if (!$hash_name{$name}){
		$hash_name{$name}=0;
	}
	elsif ($hash_name{$name}>1){
		next;
	}
	# if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
		# print "READ : $query_name\n$name\n";
		# print "HASH : ",$hash_name{$name},"\n";
		# # exit(0);
	# }
	
	
	
	chomp($query_aln);
	chomp($subject_aln);
	
	my $nb_gap_query=0;
	
	if (length($query_aln) == length($subject_aln)){
		if (length($query_aln)<$window_length-$nb_mismatch_max){
		}
		else {
			my @q = split(//,$query_aln);
			my @s = split(//,$subject_aln);
			for (my $i=0;$i<=$#q;$i++){
				my $global_idx = $query_start-1+$i-$nb_gap_query;
				if ($q[$i] eq "-"){
					if ($global_idx < $window_length){
						$compt_mismatch_5p++;
					}
					elsif ($global_idx > $window_length){
						$compt_mismatch_3p++;
					}
					$nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
				}
				else {
					if ($q[$i] ne $s[$i]){
						if ($global_idx < $window_length){
							$compt_mismatch_5p++;
						}
						elsif ($global_idx > $window_length){
							$compt_mismatch_3p++;
						}
					}
				}
			}
			$compt_mismatch_5p += $query_start-1;
			$compt_mismatch_3p += $window_length *2 + 1 - $query_stop;
			
			# for (my $i=0;$i<$window_length;$i++){
				# if ($tbl_q_aln[$i] eq "#"){
					# $compt_mismatch_5p++;
				# }
				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
					# $compt_mismatch_5p++;
				# }
				# else {
				# }

			# }
			# for (my $i=$window_length+1;$i<=$window_length*2;$i++){
				# if ($tbl_q_aln[$i] eq "#"){
					# $compt_mismatch_3p++;
				# }
				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
					# $compt_mismatch_3p++;
				# }
				# else {
				# }
			# }
			if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){
				$hash_name{$name}++;
			}
			
		}
	}
	else {
		print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
	}
	
	# if ($line=~/chrCnn_random_49828229/){
		# print $line;
		# print $query_aln,"\n";
		# print $subject_aln,"\n";
		# print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
		# print $hash_name{"chrCnn_random_49828229"},"\n";
		# print "\n";
	# }
	
	
	
}
# exit(0);
	
close (INB);

open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");

while (my $ligne = <INV>) {
	
	my @champs = split (/\s+/,$ligne);
	my $header = $champs[0]."_".$champs[1];

	if ($hash_name{$header}){
		if ($hash_name{$header}==1){
			print $ligne;
		}
	}
	else {
		#print STDERR "No blast result for ",$header,"\n";
	}

	
}

close(INV);


# foreach my $key (sort keys %hash_name){
	# print $key,"\t",$hash_name{$key},"\n";
	
# }