Mercurial > repos > yusuf > associate_phenotypes
view filter_by_index_gamma @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; use DB_File; use Parse::BooleanLogic; use Math::CDF qw(pgamma qgamma); # relevance score -> gamma p-value use PDL qw(pdl); use PDL::Stats::Distr qw(mme_gamma); # gamma dist parameter estimates use vars qw($parser %cached_sentences %sentence_index); my $quiet = 0; if(@ARGV and $ARGV[0] =~ /^-q/){ $quiet = 1; shift @ARGV; } @ARGV == 5 or die "Usage: $0 [-q(uiet)] <index filename base> <db name> <hgvs_annotated.txt> <output.txt> <query>\nWhere query has the format \"this or that\", \"this and that\", etc.\n"; my $signal_p = 0.95; # signal is top 5% of scores my $index_filename_base = shift @ARGV; my $db_name = shift @ARGV; my $hgvs_file = shift @ARGV; my $out_file = shift @ARGV; my $orig_query = shift @ARGV; $parser = new Parse::BooleanLogic(operators => ['and', 'or']); my $query_tree = $parser->as_array($orig_query, error_cb => sub {die "Could not parse query: @_\n"}); # For simplicity, turn the tree into a base set of or statements (which means expanding "A and (B or C)" into "A and B or A and C") a.k.a. "sum of products/minterms" my @query_terms = flatten_query($query_tree); my $df_index_filename = $index_filename_base."df_index"; my %df_index; my $df_index_handle = tie %df_index, "DB_File", $df_index_filename, O_RDONLY, 0400, $DB_BTREE or die "Cannot open $df_index_filename: $!\n"; my $gene_record_count = $df_index{"__DOC_COUNT__"}; my $sentence_index_filename = $index_filename_base."sentence_index"; my $sentence_index_handle = tie %sentence_index, "DB_File", $sentence_index_filename, O_RDONLY, 0400, $DB_HASH or die "Cannot open $sentence_index_filename: $!\n"; # Get the list of gene symbols we'll need open(HGVS, $hgvs_file) or die "Cannot open $hgvs_file for reading: $!\n"; my $header = <HGVS>; chomp $header; my @header_columns = split /\t/, $header; my ($gene_name_column, $chr_column, $from_column, $to_column); for(my $i = 0; $i <= $#header_columns; $i++){ if($header_columns[$i] eq "Gene Name"){ $gene_name_column = $i; } elsif($header_columns[$i] eq "Chr"){ $chr_column = $i; } elsif($header_columns[$i] eq "DNA From"){ $from_column = $i; } elsif($header_columns[$i] eq "DNA To"){ $to_column = $i; } } my $blank_query = not @query_terms; # Special case of empty query means print all info for variant ranges listed in the input HGVS file (assuming the DB was indexed to include chr:pos keys) if($blank_query){ #print STDERR "Running blank query\n"; if(not defined $chr_column){ die "Could not find 'Chr' column in the input header, aborting\n"; } if(not defined $from_column){ die "Could not find 'DNA From' column in the input header, aborting\n"; } if(not defined $to_column){ die "Could not find 'DNA To' column in the input header, aborting\n"; } # Build the list of locations that will need to be searched in the index open(OUT, ">$out_file") or die "Cannot open $out_file for writing: $!\n"; print OUT $header, "\t$db_name Text Matches\n"; while(<HGVS>){ chomp; my @F = split /\t/, $_, -1; my @pos_data; for my $pos ($F[$from_column]..$F[$to_column]){ # for each position in the range my $pos_match_data = fetch_sentence("$F[$chr_column]:$pos", -1); # fetch all data for this position push @pos_data, "*$F[$chr_column]:$pos* ".$pos_match_data if defined $pos_match_data; } print OUT join("\t", @F, join(" // ", @pos_data)),"\n"; } close(OUT); exit; } elsif(not defined $gene_name_column){ die "Could not find 'Gene Name' column in the input header, aborting\n"; } #print STDERR "Query terms: " , scalar(@query_terms), "\n"; my %gene_to_query_match_ranges; # Determine the set of genes that might match the query, based on the word index for my $query_term (@query_terms){ #print STDERR "Query term $query_term\n"; my %doc_hits; # how many needed words match the document? my $contiguous = 1; #by default multiword queries must be contiguous # Unless it's an AND query if($query_term =~ s/ and / /g){ $contiguous = 0; } my @words = split /\s+/, $query_term; # can be multi-word term like "mental retardation" for(my $i = 0; $i <= $#words; $i++){ my $word = mc($words[$i]); # can be a stem word, like hypoton #print STDERR "Checking word $word..."; if($i == 0){ my $first_word_docs = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem #print STDERR scalar(keys %$first_word_docs), " documents found\n"; for my $doc (keys %$first_word_docs){ $doc_hits{$doc} = $first_word_docs->{$doc}; # populate initial hit list that'll be whittled down in subsequent outer loops of multiword phrase members } next; } my @candidate_docs = keys %doc_hits; last if not @candidate_docs; # short circuit searches guaranteed to fail # each additional word must directly follow an existing match my $word_doc_offsets_ref = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem #print STDERR scalar(keys %$word_doc_offsets_ref), " documents found\n"; for my $doc (@candidate_docs){ my $num_matches = 0; if(not exists $word_doc_offsets_ref->{$doc}){ # required word missing, eliminate doc from consideration delete $doc_hits{$doc}; next; } # see if any of the instances of the additional words directly follow the last word we successfully matched my $so_far_matches_ref = $doc_hits{$doc}; my $next_word_matches_ref = $word_doc_offsets_ref->{$doc}; for (my $j=0; $j <= $#{$so_far_matches_ref}; $j++){ my $existing_match_extended = 0; next unless defined $so_far_matches_ref->[$j]->[2]; # every once in a while there is no article id parsed for (my $k=0; $k <= $#{$next_word_matches_ref}; $k++){ # Same article? next unless defined $next_word_matches_ref->[$k]->[2] and $next_word_matches_ref->[$k]->[2] eq $so_far_matches_ref->[$j]->[2]; if(not $contiguous){ $so_far_matches_ref->[$j]->[4] .= " AND ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too if(ref $so_far_matches_ref->[$j]->[3] ne "ARRAY"){ # match does not yet span multiple sentences last if $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; # same sentence $so_far_matches_ref->[$j]->[3] = [$so_far_matches_ref->[$j]->[3], $next_word_matches_ref->[$k]->[3]]; # change from scalar to array (of sentence numbers) } elsif(not grep {$_ eq $next_word_matches_ref->[$k]->[3]} @{$so_far_matches_ref->[$j]->[3]}){ push @{$so_far_matches_ref->[$j]->[3]}, $next_word_matches_ref->[$k]->[3]; # add top spanning sentences list of not already there } } # else contiguous word occurences required. # Same sentence? next unless $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; my $space_between_match_words = $next_word_matches_ref->[$k]->[0] - $so_far_matches_ref->[$j]->[1]; if($space_between_match_words <= 2){ $existing_match_extended = 1; $so_far_matches_ref->[$j]->[1] = $next_word_matches_ref->[$k]->[1]; # move the match cursor to include the new extending word $so_far_matches_ref->[$j]->[4] .= " ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too last; } elsif($space_between_match_words > 2){ # more than two typographical symbols between words, consider non-continuous last; # since the offsets are in order, any further k would only yield a larger spacing, so shortcircuit } } if(not $existing_match_extended){ splice(@$so_far_matches_ref, $j, 1); $j--; } else{ $num_matches++; } } if(not $num_matches){ delete $doc_hits{$doc}; } } } # the only keys that get to this point should be those that match all terms for my $doc (keys %doc_hits){ $gene_to_query_match_ranges{$doc} = [] if not exists $gene_to_query_match_ranges{$doc}; push @{$gene_to_query_match_ranges{$doc}}, [$query_term, @{$doc_hits{$doc}}]; } } my @matched_genes = keys %gene_to_query_match_ranges; #print STDERR "Found ", scalar(@matched_genes), "/$gene_record_count records in cached iHOP matching the query\n" unless $quiet; my %query_gene_counts; my %ntf; for my $gene (keys %gene_to_query_match_ranges){ my $max_doc_word_count = $df_index{"__DOC_MAX_WC_$gene"}; for my $count_record (@{$gene_to_query_match_ranges{$gene}}){ my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record; # next if $query_term eq $gene; # slightly controversial? exclude references to genes from the score if the gene is the record being talked about (obviously it will be highly scored) # allows us to find first degree interactors (i.e. points for "A interacts with B", in the record describing A) without creating crazy high score for doc describing gene B if B was in the original query without any phenotype query terms $query_gene_counts{$query_term}++; $ntf{$gene} = {} unless exists $ntf{$gene}; # atypical use of log in order to weigh heavy use of a common term less than occasional use of a rare term $ntf{$gene}->{$query_term} = log($#query_term_match_ranges_in_this_gene+2)/log($max_doc_word_count+1); } #print STDERR "Doc max word count is $max_doc_word_count for $gene, ntf keys = ", keys %{$ntf{$gene}}, "\n"; } my %idf; for my $query_term (@query_terms){ # convert %idf values from documents-with-the-query-term-count to actual IDF next unless exists $query_gene_counts{$query_term}; # query not in the document collection $idf{$query_term} = log($gene_record_count/$query_gene_counts{$query_term}); #print STDERR "$query_term IDF is $idf{$query_term}\n"; } # Create a relevance score using a normalized term frequency - inverse document frequency summation my %relevance_score; my %matched_query_terms; for my $gene_symbol (keys %gene_to_query_match_ranges){ my $relevance_score = 0; # Hmm, take average, sum or max of TF-IDFs? my $max_query_score = 0; my @matched_query_terms; my $query_score = 0; for (my $i = 0; $i <= $#query_terms; $i++){ my $query_term = $query_terms[$i]; next unless exists $idf{$query_term}; next unless exists $ntf{$gene_symbol}->{$query_term}; $query_score += $ntf{$gene_symbol}->{$query_term}*$idf{$query_term}; push @matched_query_terms, $query_term; $query_score *= 1-$i/scalar(@query_terms)/2 if scalar(@query_terms) > 2;# adjust the query score so the first terms are weighted more heavily if a bunch of terms are being searched $max_query_score = $query_score if $query_score > $max_query_score; $relevance_score += $query_score; } # this square root trick will not affect the score of a single term query, but will penalize a high total score that is comprised of a bunch of low value individual term scores) $relevance_score{$gene_symbol} = sqrt($relevance_score*$max_query_score); #print STDERR "Relevance score for $gene_symbol is $relevance_score{$gene_symbol}\n"; $matched_query_terms{$gene_symbol} = \@matched_query_terms; } # Characterize relevance score as a gamma statistical distribution and convert to probability my $max_relevance_score = 0; for my $relevance_score (values %relevance_score){ $max_relevance_score = $relevance_score if $relevance_score > $max_relevance_score; } # Remove top end scores as signal, characterize the rest as noise. # Iterative estimation of gamma parameters and removing data within range where CDF>99% my $noise_data = pdl(values %relevance_score); my ($shape, $scale) = $noise_data->mme_gamma(); #print STDERR "Initial gamma distribution estimates: $shape, $scale (max observation $max_relevance_score)\n"; my $signal_cutoff = qgamma($signal_p, $shape, 1/$scale); my @noise_data; for my $gene_symbol (keys %relevance_score){ my $score = $relevance_score{$gene_symbol}; push @noise_data, $score if $score < $signal_cutoff; } $noise_data = pdl(@noise_data); ($shape, $scale) = $noise_data->mme_gamma(); #print STDERR "Revised gamma distribution estimates (noise estimate at $signal_cutoff (CDF $signal_p)): $shape, $scale\n"; # Convert scores to probabilities for my $gene_symbol (keys %relevance_score){ $relevance_score{$gene_symbol} = 1-pgamma($relevance_score{$gene_symbol}, $shape, 1/$scale); } #TODO: create summary stats for each query term so the user gets an idea of each's contribution? my %pubmed_matches; for my $gene_symbol (keys %gene_to_query_match_ranges){ my $query_match_ranges_ref = $gene_to_query_match_ranges{$gene_symbol}; my %matching_sentences; for my $count_record (@$query_match_ranges_ref){ my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record; for my $occ_info (@query_term_match_ranges_in_this_gene){ my $id = $occ_info->[2]; my $sentence_number = $occ_info->[3]; my $query_match_word = $occ_info->[4]; # Fetch the preparsed sentence from the sentence index based on id and sentence number # Will automatically *HIGHLIGHT* the query terms fetched in the sentence over the course of this script if(ref $sentence_number eq "ARRAY"){ # match spans multiple sentences for my $s (@$sentence_number){ for my $word (split / AND /, $query_match_word){ #print STDERR "Highlighting $word in $id #$s for query term $query_term (multisentence match)\n"; $matching_sentences{fetch_sentence_key($id, $s, $word)}++; } } } else{ # single sentence match #print STDERR "Highlighting $query_match_word in $id #$sentence_number for query term $query_term\n"; $matching_sentences{fetch_sentence_key($id, $sentence_number, $query_match_word)}++; } } } $gene_symbol =~ s/_/\//; # didn't have a forward slash in a gene name for disk caching purposes if(keys %matching_sentences){ $pubmed_matches{$gene_symbol} = [] unless exists $pubmed_matches{$gene_symbol}; for my $new_match_ref (keys %matching_sentences){ push @{$pubmed_matches{$gene_symbol}}, $new_match_ref unless grep {$_ eq $new_match_ref} @{$pubmed_matches{$gene_symbol}}; # only put in new sentences, no need to dup } } } $orig_query =~ s/\s+/ /; # normalized whitespace $orig_query =~ s/ and / and /i; # lc() my @orig_query_terms = split /\s+or\s+/, $orig_query; open(OUT, ">$out_file") or die "Cannot open $out_file for writing: $!\n"; my $new_header = $header; $new_header .= "\t$db_name p-value (log normalized TF-IDF score, gamma dist)\t$db_name Matching Terms ($orig_query)\t$db_name Text Matches"; print OUT $new_header, "\n"; # Check if any of the variants in the annotated HGVS table are in genes from the OMIM match list while(<HGVS>){ chomp; my @F = split /\t/, $_, -1; # order the ids from highest number of sentence matches to lowest, from highest ranked term to least my (%id2match_count, %id2sentences); my @matched_genes; my $relevance_score_final = 1; my @matched_query_terms; for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ next unless exists $pubmed_matches{$gene_name}; push @matched_genes, $gene_name; for my $sentence_ref (@{$pubmed_matches{$gene_name}}){ # 0 == always fetch the title which is stored in sentence index 0 my $pubmed_record = fetch_sentence($sentence_ref); $id2match_count{$pubmed_record->[0]}++; # key = id if(not exists $id2sentences{$pubmed_record->[0]}){ $id2sentences{$pubmed_record->[0]} = {}; my $title_record = fetch_sentence(fetch_sentence_key($pubmed_record->[0], 0, "")); next unless $title_record->[0]; print STDERR "No $index_filename_base sentence number for ", $title_record->[0], "\n" if not defined $title_record->[1]; print STDERR "No $index_filename_base sentence text for ", $title_record->[0], " sentence #", $title_record->[1], "\n" if not defined $title_record->[2]; $id2sentences{$title_record->[0]}->{$title_record->[2]} = $title_record->[1]; } # Only print sentences that match a query term other than the gene name for the record key, if that gene name is part of the query my $non_self_query_ref = 0; while($pubmed_record->[2] =~ /\*(.+?)\*/g){ if($1 ne $gene_name){ $non_self_query_ref = 1; last; } } #print STDERR "rejected $gene_name self-only sentence ",$pubmed_record->[2],"\n" unless $non_self_query_ref; next unless $non_self_query_ref; $id2sentences{$pubmed_record->[0]}->{$pubmed_record->[2]} = $pubmed_record->[1]; # value = sentence order within pubmed text } $relevance_score_final *= $relevance_score{$gene_name}; push @matched_query_terms, @{$matched_query_terms{$gene_name}}; } # If we get here, there were matches my @ordered_ids = sort {$id2match_count{$b} <=> $id2match_count{$a}} keys %id2match_count; # print sentences in each id in order, with ellipsis if not contiguous my %h; print OUT join("\t", @F, ($relevance_score_final != 1 ? $relevance_score_final : ""), (@matched_query_terms ? join("; ", sort grep {not $h{$_}++} @matched_query_terms) : "")), "\t"; my $first_record = 1; for my $id (@ordered_ids){ my $sentence2order = $id2sentences{$id}; my @ordered_sentences = sort {$sentence2order->{$a} <=> $sentence2order->{$b}} keys %$sentence2order; next if scalar(@ordered_sentences) == 1; # due to self-gene only referencing filter above, we may have no matching sentences in a record. Skip in this case. if($first_record){ $first_record = 0; } else{ print OUT " // "; } my $title = shift(@ordered_sentences); print OUT "$db_name $id",(defined $title ? " $title": ""),":"; # first sentence is always the record title my $last_ordinal = 0; for my $s (@ordered_sentences){ if($last_ordinal and $sentence2order->{$s} != $last_ordinal+1){ print OUT ".."; } print OUT " ",$s; $last_ordinal = $sentence2order->{$s}; } } print OUT "\n"; } sub get_doc_offsets{ my ($db_handle, $word_stem) = @_; my %doc2offsets; my $is_uc = $word_stem =~ /^[A-Z0-9]+$/; my $has_wildcard = $word_stem =~ s/\*$//; my $value = 0; my $cursor_key = $word_stem; # retrieves the first for(my $status = $db_handle->seq($cursor_key, $value, R_CURSOR); $status == 0; $status = $db_handle->seq($cursor_key, $value, R_NEXT)){ if(CORE::index($cursor_key,$word_stem) != 0){ last; # outside the records that have the requested stem now } for my $record (split /\n/s, $value){ my ($doc, @occ_infos) = split /:/, $record; $doc2offsets{$doc} = [] if not exists $doc2offsets{$doc}; for my $occ_info (@occ_infos){ my ($term_offset, $id, $sentence_number) = split /,/, $occ_info, -1; # record start and end of word to facilitate partial key consecutive word matching algorithm used in this script push @{$doc2offsets{$doc}}, [$term_offset, $term_offset+length($cursor_key), $id, $sentence_number, $cursor_key]; } } last if $is_uc and not $has_wildcard; # only exact matches for upper case words like gene names } return \%doc2offsets; } sub mc{ if($_[0] =~ /^[A-Z][a-z]+$/){ return lc($_[0]); # sentence case normalization to lower case for regular words } else{ return $_[0]; # as-is for gene names, etc } } sub fetch_sentence_key{ my ($id, $sentence_number, $query_term) = @_; $sentence_number = 0 if not defined $sentence_number; return ":$sentence_number" if not $id; my $key = "$id:$sentence_number"; if(not exists $cached_sentences{$key}){ my @sentences = split /\n/, $sentence_index{$id}; $cached_sentences{$key} = $sentences[$sentence_number]; } $cached_sentences{$key} =~ s/\b\Q$query_term\E\b(?!\*)/"*".uc($query_term)."*"/ge unless $query_term eq ""; #print STDERR "Highlighted $query_term in $cached_sentences{$key}\n" if $query_term =~ /cirrhosis/; return $key; } sub fetch_sentence{ if(@_ == 1){ # from cache return [split(/:/, $_[0]), $cached_sentences{$_[0]}]; } else{ # if more than one arg, DIRECT FROM index key as first arg, sentence # is second arg return undef if not exists $sentence_index{$_[0]}; my @sentences = split /\n/, $sentence_index{$_[0]}; if($_[1] < 0){ # all sentences request return join("; ", @sentences); } return $sentences[$_[1]]; } } # boolean operator tree to flat expanded single depth "or" op query sub flatten_query{ my $tree = shift @_; my @or_queries; # Base case: the tree is just a leaf (denoted by a hash reference). Return value of the operand it represents. if(ref $tree eq "HASH"){ return ($tree->{"operand"}); } elsif(not ref $tree){ return $tree; } # Otherwise it's an operation array if(ref $tree ne "ARRAY"){ die "Could not parse $tree, logic error in the query parser\n"; } # Deal with AND first since it has higher precedence for (my $i = 1; $i < $#{$tree}; $i++){ if($tree->[$i] eq "and"){ my @expanded_term; my @t1_terms = flatten_query($tree->[$i-1]); my @t2_terms = flatten_query($tree->[$i+1]); #print STDERR "need to expand ", $tree->[$i-1], "(@t1_terms) AND ", $tree->[$i+1], "(@t2_terms)\n"; for my $term1 (@t1_terms){ for my $term2 (@t2_terms){ #print STDERR "Expanding to $term1 and $term2\n"; push @expanded_term, "$term1 and $term2"; } } splice(@$tree, $i-1, 3, @expanded_term); $i--; # list has been shortened } } # Should be only "OR" ops left # Resolve any OR subtrees for(my $i = 0; $i <= $#{$tree}; $i++){ next if $tree->[$i] eq "or"; push @or_queries, flatten_query($tree->[$i]); # otherwise recursive parse } return @or_queries; }