| 0 | 1 #!/usr/bin/perl | 
| 7 | 2 #V1.0.1 added log, option parameters | 
| 0 | 3 use strict; | 
|  | 4 use warnings; | 
| 7 | 5 use Getopt::Long; | 
| 0 | 6 | 
|  | 7 | 
| 7 | 8 my $input_variant_file; | 
|  | 9 my $input_blast_file; | 
|  | 10 my $log_file; | 
|  | 11 my $WINDOW_LENGTH= 50; | 
|  | 12 my $NB_MISMATCH_MAX = 3; | 
| 0 | 13 | 
| 7 | 14 GetOptions ( | 
|  | 15 "input_variant_file=s" => \$input_variant_file, | 
|  | 16 "input_blast_file=s" => \$input_blast_file, | 
|  | 17 "window_length=s" => \$WINDOW_LENGTH, | 
|  | 18 "log_file=s" => \$log_file, | 
|  | 19 "nb_mismatch_max=s" => \$NB_MISMATCH_MAX | 
|  | 20 ) or die("Error in command line arguments\n"); | 
|  | 21 | 
|  | 22 my $nb_variant_checked=0; | 
|  | 23 my $nb_variant_selected=0; | 
| 0 | 24 my %hash_name; | 
|  | 25 | 
|  | 26 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); | 
|  | 27 while (my $line =<INB>){ | 
|  | 28 	my @fields = split (/\s+/,$line); | 
|  | 29 	# print $#fields,"\n"; | 
|  | 30 	# print $line; | 
|  | 31 	# exit(0); | 
|  | 32 	my $query_name; | 
|  | 33 	my $query_start; | 
|  | 34 	my $query_stop; | 
|  | 35 	my $query_aln; | 
|  | 36 	my $subject_aln; | 
|  | 37 	my $compt_mismatch_5p=0; | 
|  | 38 	my $compt_mismatch_3p=0; | 
|  | 39 | 
|  | 40 	if ($#fields == 24){ | 
|  | 41 		$query_name = $fields[0]; | 
|  | 42 		$query_start = $fields[6]; | 
|  | 43 		$query_stop = $fields[7]; | 
|  | 44 		$query_aln = $fields[20]; | 
|  | 45 		$subject_aln = $fields[21]; | 
|  | 46 	} | 
|  | 47 	elsif ($#fields == 4){ | 
|  | 48 		$query_name = $fields[0]; | 
|  | 49 		$query_start = $fields[1]; | 
|  | 50 		$query_stop = $fields[2]; | 
|  | 51 		$query_aln = $fields[3]; | 
|  | 52 		$subject_aln = $fields[4]; | 
|  | 53 	} | 
|  | 54 	else { | 
|  | 55 		print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n"; | 
|  | 56 		exit(0); | 
|  | 57 	} | 
|  | 58 | 
|  | 59 | 
|  | 60 	my @field_name = split (/\_/,$query_name); | 
|  | 61 	my $name = $field_name[0]."_".$field_name[1]; | 
|  | 62 	if ($query_name =~ /random/){ | 
|  | 63 		$name .= "_".$field_name[2]; | 
|  | 64 		# print $name."\n"; | 
|  | 65 | 
|  | 66 	} | 
|  | 67 	if (!$hash_name{$name}){ | 
|  | 68 		$hash_name{$name}=0; | 
|  | 69 	} | 
|  | 70 	elsif ($hash_name{$name}>1){ | 
|  | 71 		next; | 
|  | 72 	} | 
|  | 73 	# if ($query_name eq "chrUnn_random_8279117_1_M1_a"){ | 
|  | 74 		# print "READ : $query_name\n$name\n"; | 
|  | 75 		# print "HASH : ",$hash_name{$name},"\n"; | 
|  | 76 		# # exit(0); | 
|  | 77 	# } | 
|  | 78 | 
|  | 79 | 
|  | 80 | 
|  | 81 	chomp($query_aln); | 
|  | 82 	chomp($subject_aln); | 
|  | 83 | 
|  | 84 	my $nb_gap_query=0; | 
|  | 85 | 
|  | 86 	if (length($query_aln) == length($subject_aln)){ | 
| 7 | 87 		if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){ | 
| 0 | 88 		} | 
|  | 89 		else { | 
|  | 90 			my @q = split(//,$query_aln); | 
|  | 91 			my @s = split(//,$subject_aln); | 
|  | 92 			for (my $i=0;$i<=$#q;$i++){ | 
|  | 93 				my $global_idx = $query_start-1+$i-$nb_gap_query; | 
|  | 94 				if ($q[$i] eq "-"){ | 
| 7 | 95 					if ($global_idx < $WINDOW_LENGTH){ | 
| 0 | 96 						$compt_mismatch_5p++; | 
|  | 97 					} | 
| 7 | 98 					elsif ($global_idx > $WINDOW_LENGTH){ | 
| 0 | 99 						$compt_mismatch_3p++; | 
|  | 100 					} | 
|  | 101 					$nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global | 
|  | 102 				} | 
|  | 103 				else { | 
|  | 104 					if ($q[$i] ne $s[$i]){ | 
| 7 | 105 						if ($global_idx < $WINDOW_LENGTH){ | 
| 0 | 106 							$compt_mismatch_5p++; | 
|  | 107 						} | 
| 7 | 108 						elsif ($global_idx > $WINDOW_LENGTH){ | 
| 0 | 109 							$compt_mismatch_3p++; | 
|  | 110 						} | 
|  | 111 					} | 
|  | 112 				} | 
|  | 113 			} | 
|  | 114 			$compt_mismatch_5p += $query_start-1; | 
| 7 | 115 			$compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop; | 
| 0 | 116 | 
|  | 117 			# for (my $i=0;$i<$window_length;$i++){ | 
|  | 118 				# if ($tbl_q_aln[$i] eq "#"){ | 
|  | 119 					# $compt_mismatch_5p++; | 
|  | 120 				# } | 
|  | 121 				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ | 
|  | 122 					# $compt_mismatch_5p++; | 
|  | 123 				# } | 
|  | 124 				# else { | 
|  | 125 				# } | 
|  | 126 | 
|  | 127 			# } | 
|  | 128 			# for (my $i=$window_length+1;$i<=$window_length*2;$i++){ | 
|  | 129 				# if ($tbl_q_aln[$i] eq "#"){ | 
|  | 130 					# $compt_mismatch_3p++; | 
|  | 131 				# } | 
|  | 132 				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ | 
|  | 133 					# $compt_mismatch_3p++; | 
|  | 134 				# } | 
|  | 135 				# else { | 
|  | 136 				# } | 
|  | 137 			# } | 
| 7 | 138 			if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){ | 
| 0 | 139 				$hash_name{$name}++; | 
|  | 140 			} | 
|  | 141 | 
|  | 142 		} | 
|  | 143 	} | 
|  | 144 	else { | 
|  | 145 		print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n"; | 
|  | 146 	} | 
|  | 147 | 
|  | 148 	# if ($line=~/chrCnn_random_49828229/){ | 
|  | 149 		# print $line; | 
|  | 150 		# print $query_aln,"\n"; | 
|  | 151 		# print $subject_aln,"\n"; | 
|  | 152 		# print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n"; | 
|  | 153 		# print $hash_name{"chrCnn_random_49828229"},"\n"; | 
|  | 154 		# print "\n"; | 
|  | 155 	# } | 
|  | 156 | 
|  | 157 | 
|  | 158 | 
|  | 159 } | 
|  | 160 # exit(0); | 
|  | 161 | 
|  | 162 close (INB); | 
|  | 163 | 
|  | 164 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); | 
|  | 165 | 
|  | 166 while (my $ligne = <INV>) { | 
| 7 | 167 	$nb_variant_checked++; | 
| 0 | 168 | 
|  | 169 	my @champs = split (/\s+/,$ligne); | 
|  | 170 	my $header = $champs[0]."_".$champs[1]; | 
|  | 171 | 
|  | 172 	if ($hash_name{$header}){ | 
|  | 173 		if ($hash_name{$header}==1){ | 
|  | 174 			print $ligne; | 
| 7 | 175 			$nb_variant_selected++; | 
| 0 | 176 		} | 
|  | 177 	} | 
|  | 178 	else { | 
|  | 179 		#print STDERR "No blast result for ",$header,"\n"; | 
|  | 180 	} | 
|  | 181 | 
|  | 182 | 
|  | 183 } | 
|  | 184 | 
|  | 185 close(INV); | 
|  | 186 | 
| 7 | 187 open (LF,">$log_file") or die("Can't open $log_file\n"); | 
|  | 188 print LF "\n####\t Blast filtering \n"; | 
|  | 189 print LF "Variant checked  :\t$nb_variant_checked\n"; | 
|  | 190 print LF "Variant selected :\t$nb_variant_selected\n"; | 
|  | 191 close (LF); | 
|  | 192 | 
| 0 | 193 | 
|  | 194 # foreach my $key (sort keys %hash_name){ | 
|  | 195 	# print $key,"\t",$hash_name{$key},"\n"; | 
|  | 196 | 
|  | 197 # } |