Mercurial > repos > mcharles > genephys
view genephys/MergeBlastResults.pl @ 3:8dfa09868059 draft
Uploaded
author | mcharles |
---|---|
date | Fri, 24 Oct 2014 05:54:20 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl #V1.0.3 header added #V1.0.2 suppressed the final sort (very heavyload) and replaced it by another level of hash #V1.0.1 added log, option parameters use strict; use warnings; use Getopt::Long; my $inputblast; my $outputjoin; my $log_file; my $MAX_OVERLAP_FRACTION = 0.5; my $MAX_OVERLAP_LENGTH_IGNORED = 3; my $VERBOSE = "OFF"; my $ALLOWED_GAP_FRACTION_FOR_MERGING = 0.3; my $HEADER =""; GetOptions ( "input_blasttab_file=s" => \$inputblast, "output_joinmatch_file=s" => \$outputjoin, "log_file=s" => \$log_file, "header=s" => \$HEADER, "max_overlap_fraction=f" => \$MAX_OVERLAP_FRACTION, "max_overlap_length_ignored=i" =>\$MAX_OVERLAP_LENGTH_IGNORED ) or die("Error in command line arguments\n"); open(IB, $inputblast) or die ("Can't open $inputblast \n"); open (LF,">$log_file") or die("Can't open $log_file\n"); my %match_by_query; my @query_keys; my %querys; my $stats_nb_match=0; my $stats_included=0; my $stats_large_overlapping=0; my %stats_query_coverage; $stats_query_coverage{"0-10%"}=0; $stats_query_coverage{"10-20%"}=0; $stats_query_coverage{"20-30%"}=0; $stats_query_coverage{"30-40%"}=0; $stats_query_coverage{"40-50%"}=0; $stats_query_coverage{"50-60%"}=0; $stats_query_coverage{"60-70%"}=0; $stats_query_coverage{"70-80%"}=0; $stats_query_coverage{"80-90%"}=0; $stats_query_coverage{"90-100%"}=0; my $current_query=""; while (my $ligne = <IB>){ my @fields = split (/\t/,$ligne); if ($#fields != 9){ print STDERR "Invalid blasttab format, must have 10 columns\n"; exit(0); } $stats_nb_match++; my %match; $match{"Query"}=$fields[0]; if (!$querys{$match{"Query"}}){ push(@query_keys,$match{"Query"}); $querys{$match{"Query"}} = 1; } $match{"Subject_id"}=$fields[1]; $match{"Orientation"}="+"; $match{"Query_start"}=$fields[2]; $match{"Query_end"}=$fields[3]; $match{"Subject_start"}=$fields[4]; $match{"Subject_end"}=$fields[5]; if ($fields[2]>$fields[3]){ $match{"Query_start"}=$fields[3]; $match{"Query_end"}=$fields[2]; $match{"Orientation"}="-"; #print "- $ligne"; } if ($fields[4]>$fields[5]){ $match{"Subject_start"}=$fields[5]; $match{"Subject_end"}=$fields[4]; $match{"Orientation"}="-"; #print "- $ligne"; } $match{"Similarity"}=$fields[6]; $match{"Query_length"}=$fields[7]; $match{"Subject_length"}=$fields[8]; chomp($fields[9]); $match{"Subject"}=$fields[9]; $match{"Ligne"}=$ligne; my $querykey = $match{"Query"}; my $key = $match{"Query"}."##".$match{"Subject"}."##".$match{"Orientation"}; if ($match{"Subject_length"}==0){ print LF "Match 0",$ligne,"\n",$match{"Subject_length"},"\n"; } my %match_by_query_and_subject; my @match_table; if ($match_by_query{$querykey}){ %match_by_query_and_subject = %{$match_by_query{$querykey}}; } if ($match_by_query_and_subject{$key}){ @match_table=@{$match_by_query_and_subject{$key}}; } push (@match_table,\%match); $match_by_query_and_subject{$key} = \@match_table; $match_by_query{$querykey}= \%match_by_query_and_subject; } close (IB); #print LF "NB query : $#query_keys\n"; #foreach my $querykey (sort @query_keys){ # my %current_match_by_query_and_subject = %{$match_by_query{$querykey}}; # foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){ # my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}}; # for (my $i=0;$i<=$#current_match_table;$i++){ # my %current_match = %{$current_match_table[$i]}; # print LF $current_match{"Ligne"}."\n"; # } # } #} #exit(0); open (OJ, ">$outputjoin") or die ("Can't open $outputjoin \n"); print OJ "##",$HEADER,"\n"; print OJ "#Query\tSubject_Id\torientation\tQuery_coverage\tSubject_coverage\tIdentity\tmin_query\tmax_query\tmin_subject\tmax_subject\tNBmatch\tq_length\tsub_length\tsubject\n"; foreach my $querykey (sort @query_keys){ my %current_match_by_query_and_subject = %{$match_by_query{$querykey}}; my @match_joined; #la table qui contient les matchs (hash) projeté de chaque subject pour une query foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){ my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}}; my @duplicate; my @overlap; for (my $i=0;$i<=$#current_match_table;$i++){ push (@duplicate,0); } if ($VERBOSE eq "ON"){ print LF "\nTable Match ($#current_match_table)\t$key\n"; for (my $i=0;$i<=$#current_match_table;$i++){ my %match=%{$current_match_table[$i]}; print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; } } #Scan d'inclusion strict for (my $i=0;$i<=$#current_match_table;$i++){ my %match1=%{$current_match_table[$i]}; for (my $j=0;$j<=$#current_match_table;$j++){ if (($j != $i)&&($duplicate[$j]==0)){ # On scan dans les deux sens, pas seulement $j = $i+1 a cause du last; my %match2=%{$current_match_table[$j]}; # Inclus Subject if (($match1{"Subject_start"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) { $duplicate[$i]=1; last; } # Inclus Query elsif (($match1{"Query_start"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) { $duplicate[$i]=2; last; } } } } my @current_match_table_filtered; for (my $i=0;$i<=$#current_match_table;$i++){ if ($duplicate[$i] == 0){ push (@current_match_table_filtered,$current_match_table[$i]); } else{ $stats_included++; } } if ($#current_match_table > $#current_match_table_filtered){ @current_match_table = @current_match_table_filtered; if ($VERBOSE eq "ON"){ print LF "Table Match filtered 1 ($#current_match_table)\n"; for (my $i=0;$i<=$#current_match_table;$i++){ my %match=%{$current_match_table[$i]}; print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; } } } undef @duplicate; for (my $i=0;$i<=$#current_match_table;$i++){ push (@duplicate,0); } ######## ###Scan d'overlap trop important # D'abord subject for (my $i=0;$i<=$#current_match_table-1;$i++){ my %match1=%{$current_match_table[$i]}; for (my $j=$i+1;$j<=$#current_match_table;$j++){ my %match2=%{$current_match_table[$j]}; if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) { my $overlap_length = $match1{"Subject_end"} - $match2{"Subject_start"}+1; my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$j]=1; } elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$i]=1; } else { } } elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) #Recherche d'inclusion dans les deux sens car els match sont classé par query coord, par par subject coord { my $overlap_length = $match2{"Subject_end"} - $match1{"Subject_start"}+1; my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$j]=1; } elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$i]=1; } else { } } } } undef @current_match_table_filtered; for (my $i=0;$i<=$#current_match_table;$i++){ if ($duplicate[$i] == 0){ push (@current_match_table_filtered,$current_match_table[$i]); } else { $stats_large_overlapping++; } } if ($#current_match_table > $#current_match_table_filtered){ @current_match_table = @current_match_table_filtered; if ($VERBOSE eq "ON"){ print LF "Table Match filtered 2 ($#current_match_table)\n"; for (my $i=0;$i<=$#current_match_table;$i++){ my %match=%{$current_match_table[$i]}; print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; } } } undef @duplicate; for (my $i=0;$i<=$#current_match_table;$i++){ push (@duplicate,0); } # Ensuite Query (Subject puis Query pour evitez des deletions complementaires query update qui enleverait toutes les entrées) for (my $i=0;$i<=$#current_match_table-1;$i++){ my %match1=%{$current_match_table[$i]}; for (my $j=$i+1;$j<=$#current_match_table;$j++){ my %match2=%{$current_match_table[$j]}; if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) { my $overlap_length = $match1{"Query_end"} - $match2{"Query_start"}+1; my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1; my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1; if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$j]=1; } elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ $duplicate[$i]=1; } else { } } #un seul sans les query sont classés } } undef @current_match_table_filtered; for (my $i=0;$i<=$#current_match_table;$i++){ if ($duplicate[$i] == 0){ push (@current_match_table_filtered,$current_match_table[$i]); } else { $stats_large_overlapping++; } } if ($#current_match_table > $#current_match_table_filtered){ @current_match_table = @current_match_table_filtered; if ($VERBOSE eq "ON"){ print LF "Table Match filtered 3 ($#current_match_table)\n"; for (my $i=0;$i<=$#current_match_table;$i++){ my %match=%{$current_match_table[$i]}; print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; } } } #Fusion des Hsp et Calcul des nouvelles metriques my $overlap_length = 0; my $Subject_coverage; my $Query_coverage; my $Identity; my $nb_match = 0; my $nb_covered_subject=0; my $nb_covered_query=0; my %match=%{$current_match_table[0]}; my $q_length = $match{"Query_length"}; my $sub_length = $match{"Subject_length"}; my $subject = $match{"Subject"}; my $min_query = $match{"Query_start"}; my $max_query = $match{"Query_end"}; my $min_subject = $match{"Subject_start"}; my $max_subject = $match{"Subject_end"}; my $orientation = $match{"Orientation"}; my $Query = $match{"Query"}; my $Subject_Id = $match{"Subject_id"}; my $Subject = $match{"Subject"}; for (my $i=0;$i<=$#current_match_table;$i++){ my %match1=%{$current_match_table[$i]}; $nb_covered_subject += $match1{"Subject_end"} - $match1{"Subject_start"} +1; $nb_covered_query += $match1{"Query_end"} - $match1{"Query_start"} +1; if ($match1{"Query_start"}<$min_query){ $min_query = $match1{"Query_start"}; } if ($match1{"Query_end"}>$max_query){ $max_query = $match1{"Query_end"}; } if ($match1{"Subject_start"}<$min_subject){ $min_subject = $match1{"Subject_start"}; } if ($match1{"Subject_end"}>$max_subject){ $max_subject = $match1{"Subject_end"}; } $nb_match += $match1{"Similarity"}; for (my $j=$i+1;$j<=$#current_match_table;$j++){ my $overlap_query=0; my $overlap_subject=0; my %match2=%{$current_match_table[$j]}; if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) { $overlap_subject = $match1{"Subject_end"} - $match2{"Subject_start"}+1; } elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) { $overlap_subject = $match2{"Subject_end"} - $match1{"Subject_start"}+1; } if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) { $overlap_query = $match1{"Query_end"} - $match2{"Query_start"} +1; } elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"})) { $overlap_query = $match2{"Query_end"} - $match1{"Query_start"}+1; } if ($overlap_query > $overlap_subject){ $overlap_length += $overlap_query; } else { $overlap_length += $overlap_subject; } $nb_covered_subject -= $overlap_subject; $nb_covered_query-= $overlap_query; } } ### Cas des tblastx ou le nb match est en aa, et les nb_covered en base if ($nb_match/$nb_covered_subject<0.34){ $nb_match = $nb_match *3; } #### $Identity = sprintf("%.2f",($nb_match-$overlap_length)*100/$nb_covered_subject); $Subject_coverage = sprintf("%.2f",$nb_covered_subject*100/$sub_length); $Query_coverage = sprintf("%.2f",$nb_covered_query*100/$q_length); if ($Subject_coverage<0.1){$stats_query_coverage{"0-10%"}++;} elsif($Subject_coverage<0.2){$stats_query_coverage{"10-20%"}++;} elsif($Subject_coverage<0.3){$stats_query_coverage{"20-30%"}++;} elsif($Subject_coverage<0.4){$stats_query_coverage{"30-40%"}++;} elsif($Subject_coverage<0.5){$stats_query_coverage{"40-50%"}++;} elsif($Subject_coverage<0.6){$stats_query_coverage{"50-60%"}++;} elsif($Subject_coverage<0.7){$stats_query_coverage{"60-70%"}++;} elsif($Subject_coverage<0.8){$stats_query_coverage{"70-80%"}++;} elsif($Subject_coverage<0.9){$stats_query_coverage{"80-90%"}++;} else{$stats_query_coverage{"90-100%"}++;} if ($VERBOSE eq "ON"){ print LF "Final\n"; print LF $Query,"\t",$Subject_Id,"\t",$orientation,"\t",$min_query,"\t",$max_query,"\t",$min_subject,"\t",$max_subject,"\t",$sub_length,"\t"; print LF "NB:",$nb_match,"\t","O:",$overlap_length,"\t","CQ:",$nb_covered_query,"\t","CS:",$nb_covered_subject,"\t",$Query_coverage,"\t",$Subject_coverage,"\t",$Identity,"\n"; } if ($subject=~/^(.*?)\s*$/){ $subject = $1; } my %current_match_joined; $current_match_joined{"Query"}=$Query; $current_match_joined{"Query_start"}=$min_query; $current_match_joined{"Query_end"}=$max_query; $current_match_joined{"Query_length"}=$q_length; $current_match_joined{"QCoverage"} = $Query_coverage; $current_match_joined{"Subject_id"}=$Subject_Id; $current_match_joined{"Subject"}=$subject; $current_match_joined{"Subject_start"}=$min_subject; $current_match_joined{"Subject_end"}=$max_subject; $current_match_joined{"Subject_length"}=$sub_length; $current_match_joined{"SCoverage"} = $Subject_coverage; $current_match_joined{"Similarity"}=$Identity; my $NBmatch = $nb_match-$overlap_length; $current_match_joined{"Nbmatch"}=$NBmatch; $current_match_joined{"Display"}="$Query\t$Subject_Id\t$orientation\t$Query_coverage%\t$Subject_coverage%\t$Identity%\t$min_query\t$max_query\t$min_subject\t$max_subject\t$NBmatch\t$q_length\t$sub_length\t$subject"; push(@match_joined,\%current_match_joined); #print OJ $match_joined{"Display"},"\n"; } my @match_joined_sorted = sort sortbyrelevanceandsubject @match_joined; for (my $i=0;$i<=$#match_joined_sorted;$i++){ my %match = %{$match_joined_sorted[$i]}; print OJ $match{"Display"},"\n"; } } #my %all_match_joined_best; #foreach my $key (sort sortkey keys %all_match_joined){ # my %match = %{$all_match_joined{$key}}; # print OJ $match{"Display"},"\n"; #} #close (OB); close (OJ); print LF "Nb query : $#query_keys\n"; print LF "Nb match : $stats_nb_match\n"; print LF "Nb match filtered included / too large overlap : $stats_included / $stats_large_overlapping \n"; print LF "Query coverage\n"; print LF "percent:\t"; foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) { print LF $key,"\t"; } print LF "\n number :\t"; foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) { print LF $stats_query_coverage{$key},"\t"; } print LF "\n"; close (LF); # for (my $i=0;$i<=$#all_match_joined;$i++){ # my $match_joined = %{$all_match_joined[$i]}; # print $match_joined{"Query"},"\t",$match_joined{"Subject"},"\t",$match_joined{"Subject_id"},"\t",$match_joined{"Similarity"},"\t",$match_joined{"Query_length"},"\t",$match_joined{"Subject_length"},"\n"; # } sub mysort{ my %matcha=%{$a}; my %matchb=%{$b}; #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n"; $matcha{"Query_start"} <=> $matchb{"Query_start"} || $matcha{"Query_end"} <=> $matchb{"Query_end"} } sub sortbyquerycoord{ my %matcha=%{$a}; my %matchb=%{$b}; #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n"; $matcha{"Query_start"} <=> $matchb{"Query_start"} || $matcha{"Query_end"} <=> $matchb{"Query_end"} } sub sortbyrelevanceandsubject{ my %matcha=%{$a}; my %matchb=%{$b}; $matchb{"Nbmatch"} <=> $matcha{"Nbmatch"} || $matchb{"QCoverage"} <=> $matcha{"QCoverage"} || $matcha{"Subject"} cmp $matchb{"Subject"} } sub sortkey { my @fieldsa = split (/\#/,$a); my @fieldsb = split (/\#/,$b); #print "$a\n$b\n"; #print $fieldsa[0]," cmp ",$fieldsb[0],"\n"; #exit(0); $fieldsa[0] cmp $fieldsb[0] || $fieldsa[1] cmp $fieldsb[1] || $fieldsa[2] <=> $fieldsb[2] }