Mercurial > repos > mcharles > genephys
diff genephys/MergeBlastResults.pl @ 3:8dfa09868059 draft
Uploaded
author | mcharles |
---|---|
date | Fri, 24 Oct 2014 05:54:20 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/MergeBlastResults.pl Fri Oct 24 05:54:20 2014 -0400 @@ -0,0 +1,545 @@ +#!/usr/bin/perl +#V1.0.3 header added +#V1.0.2 suppressed the final sort (very heavyload) and replaced it by another level of hash +#V1.0.1 added log, option parameters +use strict; +use warnings; +use Getopt::Long; + +my $inputblast; +my $outputjoin; +my $log_file; +my $MAX_OVERLAP_FRACTION = 0.5; +my $MAX_OVERLAP_LENGTH_IGNORED = 3; +my $VERBOSE = "OFF"; +my $ALLOWED_GAP_FRACTION_FOR_MERGING = 0.3; +my $HEADER =""; + +GetOptions ( +"input_blasttab_file=s" => \$inputblast, +"output_joinmatch_file=s" => \$outputjoin, +"log_file=s" => \$log_file, +"header=s" => \$HEADER, +"max_overlap_fraction=f" => \$MAX_OVERLAP_FRACTION, +"max_overlap_length_ignored=i" =>\$MAX_OVERLAP_LENGTH_IGNORED +) or die("Error in command line arguments\n"); + +open(IB, $inputblast) or die ("Can't open $inputblast \n"); +open (LF,">$log_file") or die("Can't open $log_file\n"); + + +my %match_by_query; + +my @query_keys; +my %querys; + + +my $stats_nb_match=0; +my $stats_included=0; +my $stats_large_overlapping=0; +my %stats_query_coverage; +$stats_query_coverage{"0-10%"}=0; +$stats_query_coverage{"10-20%"}=0; +$stats_query_coverage{"20-30%"}=0; +$stats_query_coverage{"30-40%"}=0; +$stats_query_coverage{"40-50%"}=0; +$stats_query_coverage{"50-60%"}=0; +$stats_query_coverage{"60-70%"}=0; +$stats_query_coverage{"70-80%"}=0; +$stats_query_coverage{"80-90%"}=0; +$stats_query_coverage{"90-100%"}=0; + +my $current_query=""; +while (my $ligne = <IB>){ + my @fields = split (/\t/,$ligne); + if ($#fields != 9){ + print STDERR "Invalid blasttab format, must have 10 columns\n"; + exit(0); + } + $stats_nb_match++; + my %match; + $match{"Query"}=$fields[0]; + if (!$querys{$match{"Query"}}){ + push(@query_keys,$match{"Query"}); + $querys{$match{"Query"}} = 1; + } + $match{"Subject_id"}=$fields[1]; + $match{"Orientation"}="+"; + $match{"Query_start"}=$fields[2]; + $match{"Query_end"}=$fields[3]; + $match{"Subject_start"}=$fields[4]; + $match{"Subject_end"}=$fields[5]; + + + if ($fields[2]>$fields[3]){ + $match{"Query_start"}=$fields[3]; + $match{"Query_end"}=$fields[2]; + $match{"Orientation"}="-"; + #print "- $ligne"; + } + + if ($fields[4]>$fields[5]){ + $match{"Subject_start"}=$fields[5]; + $match{"Subject_end"}=$fields[4]; + $match{"Orientation"}="-"; + #print "- $ligne"; + } + $match{"Similarity"}=$fields[6]; + $match{"Query_length"}=$fields[7]; + $match{"Subject_length"}=$fields[8]; + chomp($fields[9]); + $match{"Subject"}=$fields[9]; + + $match{"Ligne"}=$ligne; + + my $querykey = $match{"Query"}; + my $key = $match{"Query"}."##".$match{"Subject"}."##".$match{"Orientation"}; + if ($match{"Subject_length"}==0){ + print LF "Match 0",$ligne,"\n",$match{"Subject_length"},"\n"; + } + my %match_by_query_and_subject; + my @match_table; + + if ($match_by_query{$querykey}){ + %match_by_query_and_subject = %{$match_by_query{$querykey}}; + } + if ($match_by_query_and_subject{$key}){ + @match_table=@{$match_by_query_and_subject{$key}}; + } + + push (@match_table,\%match); + $match_by_query_and_subject{$key} = \@match_table; + $match_by_query{$querykey}= \%match_by_query_and_subject; +} + +close (IB); + +#print LF "NB query : $#query_keys\n"; +#foreach my $querykey (sort @query_keys){ +# my %current_match_by_query_and_subject = %{$match_by_query{$querykey}}; +# foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){ +# my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}}; +# for (my $i=0;$i<=$#current_match_table;$i++){ +# my %current_match = %{$current_match_table[$i]}; +# print LF $current_match{"Ligne"}."\n"; +# } +# } +#} +#exit(0); + +open (OJ, ">$outputjoin") or die ("Can't open $outputjoin \n"); +print OJ "##",$HEADER,"\n"; +print OJ "#Query\tSubject_Id\torientation\tQuery_coverage\tSubject_coverage\tIdentity\tmin_query\tmax_query\tmin_subject\tmax_subject\tNBmatch\tq_length\tsub_length\tsubject\n"; + +foreach my $querykey (sort @query_keys){ + my %current_match_by_query_and_subject = %{$match_by_query{$querykey}}; + my @match_joined; #la table qui contient les matchs (hash) projeté de chaque subject pour une query + foreach my $key (sort {$a cmp $b} keys %current_match_by_query_and_subject){ + my @current_match_table = sort sortbyquerycoord @{$current_match_by_query_and_subject{$key}}; + my @duplicate; + my @overlap; + for (my $i=0;$i<=$#current_match_table;$i++){ + push (@duplicate,0); + } + if ($VERBOSE eq "ON"){ + print LF "\nTable Match ($#current_match_table)\t$key\n"; + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match=%{$current_match_table[$i]}; + print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + } + } + + #Scan d'inclusion strict + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match1=%{$current_match_table[$i]}; + for (my $j=0;$j<=$#current_match_table;$j++){ + if (($j != $i)&&($duplicate[$j]==0)){ # On scan dans les deux sens, pas seulement $j = $i+1 a cause du last; + my %match2=%{$current_match_table[$j]}; + # Inclus Subject + if (($match1{"Subject_start"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + $duplicate[$i]=1; + last; + } + # Inclus Query + elsif (($match1{"Query_start"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + $duplicate[$i]=2; + last; + } + + } + } + } + + my @current_match_table_filtered; + for (my $i=0;$i<=$#current_match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@current_match_table_filtered,$current_match_table[$i]); + } + else{ + $stats_included++; + } + } + + if ($#current_match_table > $#current_match_table_filtered){ + @current_match_table = @current_match_table_filtered; + if ($VERBOSE eq "ON"){ + print LF "Table Match filtered 1 ($#current_match_table)\n"; + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match=%{$current_match_table[$i]}; + print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + } + } + } + + undef @duplicate; + for (my $i=0;$i<=$#current_match_table;$i++){ + push (@duplicate,0); + } + ######## + + ###Scan d'overlap trop important + # D'abord subject + for (my $i=0;$i<=$#current_match_table-1;$i++){ + my %match1=%{$current_match_table[$i]}; + for (my $j=$i+1;$j<=$#current_match_table;$j++){ + my %match2=%{$current_match_table[$j]}; + if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + my $overlap_length = $match1{"Subject_end"} - $match2{"Subject_start"}+1; + my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; + my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + } + else { + } + } + elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) #Recherche d'inclusion dans les deux sens car els match sont classé par query coord, par par subject coord + { + my $overlap_length = $match2{"Subject_end"} - $match1{"Subject_start"}+1; + my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; + my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + } + else { + } + } + } + } + + undef @current_match_table_filtered; + for (my $i=0;$i<=$#current_match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@current_match_table_filtered,$current_match_table[$i]); + } + else { + $stats_large_overlapping++; + } + } + + if ($#current_match_table > $#current_match_table_filtered){ + @current_match_table = @current_match_table_filtered; + if ($VERBOSE eq "ON"){ + print LF "Table Match filtered 2 ($#current_match_table)\n"; + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match=%{$current_match_table[$i]}; + print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + } + } + + } + + undef @duplicate; + for (my $i=0;$i<=$#current_match_table;$i++){ + push (@duplicate,0); + } + + # Ensuite Query (Subject puis Query pour evitez des deletions complementaires query update qui enleverait toutes les entrées) + + for (my $i=0;$i<=$#current_match_table-1;$i++){ + my %match1=%{$current_match_table[$i]}; + for (my $j=$i+1;$j<=$#current_match_table;$j++){ + my %match2=%{$current_match_table[$j]}; + if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + my $overlap_length = $match1{"Query_end"} - $match2{"Query_start"}+1; + my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1; + my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + } + else { + } + } + #un seul sans les query sont classés + } + } + + undef @current_match_table_filtered; + for (my $i=0;$i<=$#current_match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@current_match_table_filtered,$current_match_table[$i]); + } + else { + $stats_large_overlapping++; + } + } + + if ($#current_match_table > $#current_match_table_filtered){ + @current_match_table = @current_match_table_filtered; + if ($VERBOSE eq "ON"){ + print LF "Table Match filtered 3 ($#current_match_table)\n"; + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match=%{$current_match_table[$i]}; + print LF $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print LF $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + } + } + } + + + + #Fusion des Hsp et Calcul des nouvelles metriques + + my $overlap_length = 0; + my $Subject_coverage; + my $Query_coverage; + my $Identity; + my $nb_match = 0; + + + my $nb_covered_subject=0; + my $nb_covered_query=0; + my %match=%{$current_match_table[0]}; + my $q_length = $match{"Query_length"}; + my $sub_length = $match{"Subject_length"}; + my $subject = $match{"Subject"}; + my $min_query = $match{"Query_start"}; + my $max_query = $match{"Query_end"}; + my $min_subject = $match{"Subject_start"}; + my $max_subject = $match{"Subject_end"}; + my $orientation = $match{"Orientation"}; + my $Query = $match{"Query"}; + my $Subject_Id = $match{"Subject_id"}; + my $Subject = $match{"Subject"}; + + + for (my $i=0;$i<=$#current_match_table;$i++){ + my %match1=%{$current_match_table[$i]}; + $nb_covered_subject += $match1{"Subject_end"} - $match1{"Subject_start"} +1; + $nb_covered_query += $match1{"Query_end"} - $match1{"Query_start"} +1; + + if ($match1{"Query_start"}<$min_query){ + $min_query = $match1{"Query_start"}; + } + if ($match1{"Query_end"}>$max_query){ + $max_query = $match1{"Query_end"}; + } + if ($match1{"Subject_start"}<$min_subject){ + $min_subject = $match1{"Subject_start"}; + } + if ($match1{"Subject_end"}>$max_subject){ + $max_subject = $match1{"Subject_end"}; + } + + $nb_match += $match1{"Similarity"}; + + for (my $j=$i+1;$j<=$#current_match_table;$j++){ + my $overlap_query=0; + my $overlap_subject=0; + my %match2=%{$current_match_table[$j]}; + + if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + $overlap_subject = $match1{"Subject_end"} - $match2{"Subject_start"}+1; + } + elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) + { + $overlap_subject = $match2{"Subject_end"} - $match1{"Subject_start"}+1; + } + + if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + $overlap_query = $match1{"Query_end"} - $match2{"Query_start"} +1; + } + elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"})) + { + $overlap_query = $match2{"Query_end"} - $match1{"Query_start"}+1; + } + + if ($overlap_query > $overlap_subject){ + $overlap_length += $overlap_query; + } + else { + $overlap_length += $overlap_subject; + } + + $nb_covered_subject -= $overlap_subject; + $nb_covered_query-= $overlap_query; + } + + } + ### Cas des tblastx ou le nb match est en aa, et les nb_covered en base + if ($nb_match/$nb_covered_subject<0.34){ + $nb_match = $nb_match *3; + } + #### + + $Identity = sprintf("%.2f",($nb_match-$overlap_length)*100/$nb_covered_subject); + + $Subject_coverage = sprintf("%.2f",$nb_covered_subject*100/$sub_length); + $Query_coverage = sprintf("%.2f",$nb_covered_query*100/$q_length); + + if ($Subject_coverage<0.1){$stats_query_coverage{"0-10%"}++;} + elsif($Subject_coverage<0.2){$stats_query_coverage{"10-20%"}++;} + elsif($Subject_coverage<0.3){$stats_query_coverage{"20-30%"}++;} + elsif($Subject_coverage<0.4){$stats_query_coverage{"30-40%"}++;} + elsif($Subject_coverage<0.5){$stats_query_coverage{"40-50%"}++;} + elsif($Subject_coverage<0.6){$stats_query_coverage{"50-60%"}++;} + elsif($Subject_coverage<0.7){$stats_query_coverage{"60-70%"}++;} + elsif($Subject_coverage<0.8){$stats_query_coverage{"70-80%"}++;} + elsif($Subject_coverage<0.9){$stats_query_coverage{"80-90%"}++;} + else{$stats_query_coverage{"90-100%"}++;} + + if ($VERBOSE eq "ON"){ + print LF "Final\n"; + print LF $Query,"\t",$Subject_Id,"\t",$orientation,"\t",$min_query,"\t",$max_query,"\t",$min_subject,"\t",$max_subject,"\t",$sub_length,"\t"; + print LF "NB:",$nb_match,"\t","O:",$overlap_length,"\t","CQ:",$nb_covered_query,"\t","CS:",$nb_covered_subject,"\t",$Query_coverage,"\t",$Subject_coverage,"\t",$Identity,"\n"; + + } + + if ($subject=~/^(.*?)\s*$/){ + $subject = $1; + } + my %current_match_joined; + $current_match_joined{"Query"}=$Query; + $current_match_joined{"Query_start"}=$min_query; + $current_match_joined{"Query_end"}=$max_query; + $current_match_joined{"Query_length"}=$q_length; + $current_match_joined{"QCoverage"} = $Query_coverage; + $current_match_joined{"Subject_id"}=$Subject_Id; + $current_match_joined{"Subject"}=$subject; + $current_match_joined{"Subject_start"}=$min_subject; + $current_match_joined{"Subject_end"}=$max_subject; + $current_match_joined{"Subject_length"}=$sub_length; + $current_match_joined{"SCoverage"} = $Subject_coverage; + $current_match_joined{"Similarity"}=$Identity; + my $NBmatch = $nb_match-$overlap_length; + $current_match_joined{"Nbmatch"}=$NBmatch; + $current_match_joined{"Display"}="$Query\t$Subject_Id\t$orientation\t$Query_coverage%\t$Subject_coverage%\t$Identity%\t$min_query\t$max_query\t$min_subject\t$max_subject\t$NBmatch\t$q_length\t$sub_length\t$subject"; + + push(@match_joined,\%current_match_joined); + #print OJ $match_joined{"Display"},"\n"; + } + my @match_joined_sorted = sort sortbyrelevanceandsubject @match_joined; + for (my $i=0;$i<=$#match_joined_sorted;$i++){ + my %match = %{$match_joined_sorted[$i]}; + print OJ $match{"Display"},"\n"; + } +} + + +#my %all_match_joined_best; + +#foreach my $key (sort sortkey keys %all_match_joined){ +# my %match = %{$all_match_joined{$key}}; +# print OJ $match{"Display"},"\n"; +#} + +#close (OB); +close (OJ); + + +print LF "Nb query : $#query_keys\n"; +print LF "Nb match : $stats_nb_match\n"; +print LF "Nb match filtered included / too large overlap : $stats_included / $stats_large_overlapping \n"; +print LF "Query coverage\n"; +print LF "percent:\t"; +foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) { + print LF $key,"\t"; +} +print LF "\n number :\t"; +foreach my $key (sort {$a cmp $b} keys %stats_query_coverage) { + print LF $stats_query_coverage{$key},"\t"; +} +print LF "\n"; + + +close (LF); + + +# for (my $i=0;$i<=$#all_match_joined;$i++){ + # my $match_joined = %{$all_match_joined[$i]}; + # print $match_joined{"Query"},"\t",$match_joined{"Subject"},"\t",$match_joined{"Subject_id"},"\t",$match_joined{"Similarity"},"\t",$match_joined{"Query_length"},"\t",$match_joined{"Subject_length"},"\n"; +# } + + +sub mysort{ + my %matcha=%{$a}; + my %matchb=%{$b}; + + #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n"; + + $matcha{"Query_start"} <=> $matchb{"Query_start"} + || + $matcha{"Query_end"} <=> $matchb{"Query_end"} + +} + +sub sortbyquerycoord{ + my %matcha=%{$a}; + my %matchb=%{$b}; + + #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n"; + + $matcha{"Query_start"} <=> $matchb{"Query_start"} + || + $matcha{"Query_end"} <=> $matchb{"Query_end"} + +} + +sub sortbyrelevanceandsubject{ + my %matcha=%{$a}; + my %matchb=%{$b}; + + $matchb{"Nbmatch"} <=> $matcha{"Nbmatch"} + || + $matchb{"QCoverage"} <=> $matcha{"QCoverage"} + || + $matcha{"Subject"} cmp $matchb{"Subject"} +} + + +sub sortkey { + my @fieldsa = split (/\#/,$a); + my @fieldsb = split (/\#/,$b); + + #print "$a\n$b\n"; + #print $fieldsa[0]," cmp ",$fieldsb[0],"\n"; + #exit(0); + + $fieldsa[0] cmp $fieldsb[0] + || + $fieldsa[1] cmp $fieldsb[1] + || + $fieldsa[2] <=> $fieldsb[2] + +}