annotate filter_by_index_gamma @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use DB_File;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 use Parse::BooleanLogic;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 use Math::CDF qw(pgamma qgamma); # relevance score -> gamma p-value
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 use PDL qw(pdl);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 use PDL::Stats::Distr qw(mme_gamma); # gamma dist parameter estimates
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 use vars qw($parser %cached_sentences %sentence_index);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 my $quiet = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 if(@ARGV and $ARGV[0] =~ /^-q/){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 $quiet = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 @ARGV == 5 or die "Usage: $0 [-q(uiet)] <index filename base> <db name> <hgvs_annotated.txt> <output.txt> <query>\nWhere query has the format \"this or that\", \"this and that\", etc.\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $signal_p = 0.95; # signal is top 5% of scores
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my $index_filename_base = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my $db_name = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 my $hgvs_file = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 my $out_file = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 my $orig_query = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 $parser = new Parse::BooleanLogic(operators => ['and', 'or']);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $query_tree = $parser->as_array($orig_query, error_cb => sub {die "Could not parse query: @_\n"});
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 # For simplicity, turn the tree into a base set of or statements (which means expanding "A and (B or C)" into "A and B or A and C") a.k.a. "sum of products/minterms"
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 my @query_terms = flatten_query($query_tree);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 my $df_index_filename = $index_filename_base."df_index";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 my %df_index;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 my $df_index_handle = tie %df_index, "DB_File", $df_index_filename, O_RDONLY, 0400, $DB_BTREE
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 or die "Cannot open $df_index_filename: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 my $gene_record_count = $df_index{"__DOC_COUNT__"};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 my $sentence_index_filename = $index_filename_base."sentence_index";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 my $sentence_index_handle = tie %sentence_index, "DB_File", $sentence_index_filename, O_RDONLY, 0400, $DB_HASH
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 or die "Cannot open $sentence_index_filename: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 # Get the list of gene symbols we'll need
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 open(HGVS, $hgvs_file)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 or die "Cannot open $hgvs_file for reading: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 my $header = <HGVS>;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 chomp $header;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 my @header_columns = split /\t/, $header;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my ($gene_name_column, $chr_column, $from_column, $to_column);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 for(my $i = 0; $i <= $#header_columns; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 if($header_columns[$i] eq "Gene Name"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 $gene_name_column = $i;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 elsif($header_columns[$i] eq "Chr"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 $chr_column = $i;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 elsif($header_columns[$i] eq "DNA From"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 $from_column = $i;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 elsif($header_columns[$i] eq "DNA To"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 $to_column = $i;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 my $blank_query = not @query_terms;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 # Special case of empty query means print all info for variant ranges listed in the input HGVS file (assuming the DB was indexed to include chr:pos keys)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 if($blank_query){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 #print STDERR "Running blank query\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 if(not defined $chr_column){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 die "Could not find 'Chr' column in the input header, aborting\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 if(not defined $from_column){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 die "Could not find 'DNA From' column in the input header, aborting\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 if(not defined $to_column){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 die "Could not find 'DNA To' column in the input header, aborting\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 # Build the list of locations that will need to be searched in the index
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 open(OUT, ">$out_file")
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 or die "Cannot open $out_file for writing: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 print OUT $header, "\t$db_name Text Matches\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 while(<HGVS>){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 chomp;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 my @F = split /\t/, $_, -1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 my @pos_data;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 for my $pos ($F[$from_column]..$F[$to_column]){ # for each position in the range
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 my $pos_match_data = fetch_sentence("$F[$chr_column]:$pos", -1); # fetch all data for this position
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 push @pos_data, "*$F[$chr_column]:$pos* ".$pos_match_data if defined $pos_match_data;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 print OUT join("\t", @F, join(" // ", @pos_data)),"\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 close(OUT);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 exit;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 elsif(not defined $gene_name_column){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 die "Could not find 'Gene Name' column in the input header, aborting\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 #print STDERR "Query terms: " , scalar(@query_terms), "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 my %gene_to_query_match_ranges;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 # Determine the set of genes that might match the query, based on the word index
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 for my $query_term (@query_terms){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 #print STDERR "Query term $query_term\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 my %doc_hits; # how many needed words match the document?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 my $contiguous = 1; #by default multiword queries must be contiguous
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 # Unless it's an AND query
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 if($query_term =~ s/ and / /g){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 $contiguous = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 my @words = split /\s+/, $query_term; # can be multi-word term like "mental retardation"
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 for(my $i = 0; $i <= $#words; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 my $word = mc($words[$i]); # can be a stem word, like hypoton
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 #print STDERR "Checking word $word...";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 if($i == 0){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 my $first_word_docs = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 #print STDERR scalar(keys %$first_word_docs), " documents found\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 for my $doc (keys %$first_word_docs){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 $doc_hits{$doc} = $first_word_docs->{$doc}; # populate initial hit list that'll be whittled down in subsequent outer loops of multiword phrase members
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 next;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 my @candidate_docs = keys %doc_hits;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 last if not @candidate_docs; # short circuit searches guaranteed to fail
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 # each additional word must directly follow an existing match
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 my $word_doc_offsets_ref = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 #print STDERR scalar(keys %$word_doc_offsets_ref), " documents found\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 for my $doc (@candidate_docs){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 my $num_matches = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 if(not exists $word_doc_offsets_ref->{$doc}){ # required word missing, eliminate doc from consideration
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 delete $doc_hits{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 next;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 # see if any of the instances of the additional words directly follow the last word we successfully matched
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 my $so_far_matches_ref = $doc_hits{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 my $next_word_matches_ref = $word_doc_offsets_ref->{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 for (my $j=0; $j <= $#{$so_far_matches_ref}; $j++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 my $existing_match_extended = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 next unless defined $so_far_matches_ref->[$j]->[2]; # every once in a while there is no article id parsed
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 for (my $k=0; $k <= $#{$next_word_matches_ref}; $k++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 # Same article?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 next unless defined $next_word_matches_ref->[$k]->[2] and $next_word_matches_ref->[$k]->[2] eq $so_far_matches_ref->[$j]->[2];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 if(not $contiguous){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 $so_far_matches_ref->[$j]->[4] .= " AND ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 if(ref $so_far_matches_ref->[$j]->[3] ne "ARRAY"){ # match does not yet span multiple sentences
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 last if $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; # same sentence
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 $so_far_matches_ref->[$j]->[3] = [$so_far_matches_ref->[$j]->[3], $next_word_matches_ref->[$k]->[3]]; # change from scalar to array (of sentence numbers)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 elsif(not grep {$_ eq $next_word_matches_ref->[$k]->[3]} @{$so_far_matches_ref->[$j]->[3]}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 push @{$so_far_matches_ref->[$j]->[3]}, $next_word_matches_ref->[$k]->[3]; # add top spanning sentences list of not already there
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 # else contiguous word occurences required.
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 # Same sentence?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 next unless $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 my $space_between_match_words = $next_word_matches_ref->[$k]->[0] - $so_far_matches_ref->[$j]->[1];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 if($space_between_match_words <= 2){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 $existing_match_extended = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 $so_far_matches_ref->[$j]->[1] = $next_word_matches_ref->[$k]->[1]; # move the match cursor to include the new extending word
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 $so_far_matches_ref->[$j]->[4] .= " ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 last;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 elsif($space_between_match_words > 2){ # more than two typographical symbols between words, consider non-continuous
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 last; # since the offsets are in order, any further k would only yield a larger spacing, so shortcircuit
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 if(not $existing_match_extended){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 splice(@$so_far_matches_ref, $j, 1);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 $j--;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 else{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 $num_matches++;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 if(not $num_matches){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 delete $doc_hits{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 # the only keys that get to this point should be those that match all terms
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 for my $doc (keys %doc_hits){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 $gene_to_query_match_ranges{$doc} = [] if not exists $gene_to_query_match_ranges{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 push @{$gene_to_query_match_ranges{$doc}}, [$query_term, @{$doc_hits{$doc}}];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 my @matched_genes = keys %gene_to_query_match_ranges;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 #print STDERR "Found ", scalar(@matched_genes), "/$gene_record_count records in cached iHOP matching the query\n" unless $quiet;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 my %query_gene_counts;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 my %ntf;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 for my $gene (keys %gene_to_query_match_ranges){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 my $max_doc_word_count = $df_index{"__DOC_MAX_WC_$gene"};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 for my $count_record (@{$gene_to_query_match_ranges{$gene}}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 # next if $query_term eq $gene; # slightly controversial? exclude references to genes from the score if the gene is the record being talked about (obviously it will be highly scored)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 # allows us to find first degree interactors (i.e. points for "A interacts with B", in the record describing A) without creating crazy high score for doc describing gene B if B was in the original query without any phenotype query terms
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 $query_gene_counts{$query_term}++;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 $ntf{$gene} = {} unless exists $ntf{$gene};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 # atypical use of log in order to weigh heavy use of a common term less than occasional use of a rare term
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 $ntf{$gene}->{$query_term} = log($#query_term_match_ranges_in_this_gene+2)/log($max_doc_word_count+1);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 #print STDERR "Doc max word count is $max_doc_word_count for $gene, ntf keys = ", keys %{$ntf{$gene}}, "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 my %idf;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 for my $query_term (@query_terms){ # convert %idf values from documents-with-the-query-term-count to actual IDF
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 next unless exists $query_gene_counts{$query_term}; # query not in the document collection
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 $idf{$query_term} = log($gene_record_count/$query_gene_counts{$query_term});
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 #print STDERR "$query_term IDF is $idf{$query_term}\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 # Create a relevance score using a normalized term frequency - inverse document frequency summation
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 my %relevance_score;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 my %matched_query_terms;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 for my $gene_symbol (keys %gene_to_query_match_ranges){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 my $relevance_score = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 # Hmm, take average, sum or max of TF-IDFs?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 my $max_query_score = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 my @matched_query_terms;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 my $query_score = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 for (my $i = 0; $i <= $#query_terms; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 my $query_term = $query_terms[$i];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 next unless exists $idf{$query_term};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 next unless exists $ntf{$gene_symbol}->{$query_term};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 $query_score += $ntf{$gene_symbol}->{$query_term}*$idf{$query_term};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 push @matched_query_terms, $query_term;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 $query_score *= 1-$i/scalar(@query_terms)/2 if scalar(@query_terms) > 2;# adjust the query score so the first terms are weighted more heavily if a bunch of terms are being searched
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 $max_query_score = $query_score if $query_score > $max_query_score;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 $relevance_score += $query_score;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 # this square root trick will not affect the score of a single term query, but will penalize a high total score that is comprised of a bunch of low value individual term scores)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 $relevance_score{$gene_symbol} = sqrt($relevance_score*$max_query_score);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 #print STDERR "Relevance score for $gene_symbol is $relevance_score{$gene_symbol}\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 $matched_query_terms{$gene_symbol} = \@matched_query_terms;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 # Characterize relevance score as a gamma statistical distribution and convert to probability
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 my $max_relevance_score = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 for my $relevance_score (values %relevance_score){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 $max_relevance_score = $relevance_score if $relevance_score > $max_relevance_score;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 # Remove top end scores as signal, characterize the rest as noise.
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 # Iterative estimation of gamma parameters and removing data within range where CDF>99%
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 my $noise_data = pdl(values %relevance_score);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 my ($shape, $scale) = $noise_data->mme_gamma();
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 #print STDERR "Initial gamma distribution estimates: $shape, $scale (max observation $max_relevance_score)\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 my $signal_cutoff = qgamma($signal_p, $shape, 1/$scale);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 my @noise_data;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 for my $gene_symbol (keys %relevance_score){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 my $score = $relevance_score{$gene_symbol};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 push @noise_data, $score if $score < $signal_cutoff;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 $noise_data = pdl(@noise_data);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 ($shape, $scale) = $noise_data->mme_gamma();
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 #print STDERR "Revised gamma distribution estimates (noise estimate at $signal_cutoff (CDF $signal_p)): $shape, $scale\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 # Convert scores to probabilities
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 for my $gene_symbol (keys %relevance_score){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 $relevance_score{$gene_symbol} = 1-pgamma($relevance_score{$gene_symbol}, $shape, 1/$scale);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 #TODO: create summary stats for each query term so the user gets an idea of each's contribution?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 my %pubmed_matches;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 for my $gene_symbol (keys %gene_to_query_match_ranges){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 my $query_match_ranges_ref = $gene_to_query_match_ranges{$gene_symbol};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 my %matching_sentences;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 for my $count_record (@$query_match_ranges_ref){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 for my $occ_info (@query_term_match_ranges_in_this_gene){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 my $id = $occ_info->[2];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 my $sentence_number = $occ_info->[3];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 my $query_match_word = $occ_info->[4];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 # Fetch the preparsed sentence from the sentence index based on id and sentence number
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 # Will automatically *HIGHLIGHT* the query terms fetched in the sentence over the course of this script
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 if(ref $sentence_number eq "ARRAY"){ # match spans multiple sentences
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 for my $s (@$sentence_number){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 for my $word (split / AND /, $query_match_word){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 #print STDERR "Highlighting $word in $id #$s for query term $query_term (multisentence match)\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 $matching_sentences{fetch_sentence_key($id, $s, $word)}++;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 else{ # single sentence match
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 #print STDERR "Highlighting $query_match_word in $id #$sentence_number for query term $query_term\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 $matching_sentences{fetch_sentence_key($id, $sentence_number, $query_match_word)}++;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 $gene_symbol =~ s/_/\//; # didn't have a forward slash in a gene name for disk caching purposes
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 if(keys %matching_sentences){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 $pubmed_matches{$gene_symbol} = [] unless exists $pubmed_matches{$gene_symbol};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 for my $new_match_ref (keys %matching_sentences){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 push @{$pubmed_matches{$gene_symbol}}, $new_match_ref unless grep {$_ eq $new_match_ref} @{$pubmed_matches{$gene_symbol}}; # only put in new sentences, no need to dup
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 $orig_query =~ s/\s+/ /; # normalized whitespace
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 $orig_query =~ s/ and / and /i; # lc()
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 my @orig_query_terms = split /\s+or\s+/, $orig_query;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 open(OUT, ">$out_file")
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 or die "Cannot open $out_file for writing: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 my $new_header = $header;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 $new_header .= "\t$db_name p-value (log normalized TF-IDF score, gamma dist)\t$db_name Matching Terms ($orig_query)\t$db_name Text Matches";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 print OUT $new_header, "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 # Check if any of the variants in the annotated HGVS table are in genes from the OMIM match list
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 while(<HGVS>){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 chomp;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313 my @F = split /\t/, $_, -1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314 # order the ids from highest number of sentence matches to lowest, from highest ranked term to least
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315 my (%id2match_count, %id2sentences);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316 my @matched_genes;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 my $relevance_score_final = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318 my @matched_query_terms;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320 next unless exists $pubmed_matches{$gene_name};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 push @matched_genes, $gene_name;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 for my $sentence_ref (@{$pubmed_matches{$gene_name}}){ # 0 == always fetch the title which is stored in sentence index 0
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 my $pubmed_record = fetch_sentence($sentence_ref);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 $id2match_count{$pubmed_record->[0]}++; # key = id
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 if(not exists $id2sentences{$pubmed_record->[0]}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 $id2sentences{$pubmed_record->[0]} = {};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327 my $title_record = fetch_sentence(fetch_sentence_key($pubmed_record->[0], 0, ""));
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 next unless $title_record->[0];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 print STDERR "No $index_filename_base sentence number for ", $title_record->[0], "\n" if not defined $title_record->[1];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 print STDERR "No $index_filename_base sentence text for ", $title_record->[0], " sentence #", $title_record->[1], "\n" if not defined $title_record->[2];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 $id2sentences{$title_record->[0]}->{$title_record->[2]} = $title_record->[1];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 # Only print sentences that match a query term other than the gene name for the record key, if that gene name is part of the query
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334 my $non_self_query_ref = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335 while($pubmed_record->[2] =~ /\*(.+?)\*/g){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 if($1 ne $gene_name){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 $non_self_query_ref = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 last;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341 #print STDERR "rejected $gene_name self-only sentence ",$pubmed_record->[2],"\n" unless $non_self_query_ref;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 next unless $non_self_query_ref;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343 $id2sentences{$pubmed_record->[0]}->{$pubmed_record->[2]} = $pubmed_record->[1]; # value = sentence order within pubmed text
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 $relevance_score_final *= $relevance_score{$gene_name};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 push @matched_query_terms, @{$matched_query_terms{$gene_name}};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 # If we get here, there were matches
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 my @ordered_ids = sort {$id2match_count{$b} <=> $id2match_count{$a}} keys %id2match_count;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 # print sentences in each id in order, with ellipsis if not contiguous
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353 my %h;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 print OUT join("\t", @F, ($relevance_score_final != 1 ? $relevance_score_final : ""), (@matched_query_terms ? join("; ", sort grep {not $h{$_}++} @matched_query_terms) : "")), "\t";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 my $first_record = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 for my $id (@ordered_ids){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 my $sentence2order = $id2sentences{$id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 my @ordered_sentences = sort {$sentence2order->{$a} <=> $sentence2order->{$b}} keys %$sentence2order;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 next if scalar(@ordered_sentences) == 1; # due to self-gene only referencing filter above, we may have no matching sentences in a record. Skip in this case.
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 if($first_record){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361 $first_record = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 else{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364 print OUT " // ";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 my $title = shift(@ordered_sentences);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 print OUT "$db_name $id",(defined $title ? " $title": ""),":"; # first sentence is always the record title
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 my $last_ordinal = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369 for my $s (@ordered_sentences){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 if($last_ordinal and $sentence2order->{$s} != $last_ordinal+1){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 print OUT "..";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 print OUT " ",$s;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 $last_ordinal = $sentence2order->{$s};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 print OUT "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 sub get_doc_offsets{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381 my ($db_handle, $word_stem) = @_;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 my %doc2offsets;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 my $is_uc = $word_stem =~ /^[A-Z0-9]+$/;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 my $has_wildcard = $word_stem =~ s/\*$//;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386 my $value = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 my $cursor_key = $word_stem;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 # retrieves the first
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 for(my $status = $db_handle->seq($cursor_key, $value, R_CURSOR);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 $status == 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 $status = $db_handle->seq($cursor_key, $value, R_NEXT)){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 if(CORE::index($cursor_key,$word_stem) != 0){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 last; # outside the records that have the requested stem now
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 for my $record (split /\n/s, $value){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 my ($doc, @occ_infos) = split /:/, $record;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 $doc2offsets{$doc} = [] if not exists $doc2offsets{$doc};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 for my $occ_info (@occ_infos){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 my ($term_offset, $id, $sentence_number) = split /,/, $occ_info, -1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 # record start and end of word to facilitate partial key consecutive word matching algorithm used in this script
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 push @{$doc2offsets{$doc}}, [$term_offset, $term_offset+length($cursor_key), $id, $sentence_number, $cursor_key];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 last if $is_uc and not $has_wildcard; # only exact matches for upper case words like gene names
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 return \%doc2offsets;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409 sub mc{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 if($_[0] =~ /^[A-Z][a-z]+$/){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 return lc($_[0]); # sentence case normalization to lower case for regular words
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 else{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414 return $_[0]; # as-is for gene names, etc
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 sub fetch_sentence_key{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
419 my ($id, $sentence_number, $query_term) = @_;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
420
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
421 $sentence_number = 0 if not defined $sentence_number;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
422 return ":$sentence_number" if not $id;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
423 my $key = "$id:$sentence_number";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
424 if(not exists $cached_sentences{$key}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
425 my @sentences = split /\n/, $sentence_index{$id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
426 $cached_sentences{$key} = $sentences[$sentence_number];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
427 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
428 $cached_sentences{$key} =~ s/\b\Q$query_term\E\b(?!\*)/"*".uc($query_term)."*"/ge unless $query_term eq "";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
429 #print STDERR "Highlighted $query_term in $cached_sentences{$key}\n" if $query_term =~ /cirrhosis/;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
430 return $key;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
431 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
432
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
433 sub fetch_sentence{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
434 if(@_ == 1){ # from cache
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
435 return [split(/:/, $_[0]), $cached_sentences{$_[0]}];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
436 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
437 else{ # if more than one arg, DIRECT FROM index key as first arg, sentence # is second arg
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
438 return undef if not exists $sentence_index{$_[0]};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
439 my @sentences = split /\n/, $sentence_index{$_[0]};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
440 if($_[1] < 0){ # all sentences request
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
441 return join("; ", @sentences);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
442 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
443 return $sentences[$_[1]];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
444 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
445 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
446
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
447
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
448 # boolean operator tree to flat expanded single depth "or" op query
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
449 sub flatten_query{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
450 my $tree = shift @_;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
451 my @or_queries;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
452
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
453 # Base case: the tree is just a leaf (denoted by a hash reference). Return value of the operand it represents.
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
454 if(ref $tree eq "HASH"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
455 return ($tree->{"operand"});
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
456 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
457
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
458 elsif(not ref $tree){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
459 return $tree;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
460 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
461
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
462 # Otherwise it's an operation array
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
463 if(ref $tree ne "ARRAY"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
464 die "Could not parse $tree, logic error in the query parser\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
465 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
466
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
467 # Deal with AND first since it has higher precedence
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
468 for (my $i = 1; $i < $#{$tree}; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
469 if($tree->[$i] eq "and"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
470 my @expanded_term;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
471 my @t1_terms = flatten_query($tree->[$i-1]);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
472 my @t2_terms = flatten_query($tree->[$i+1]);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
473 #print STDERR "need to expand ", $tree->[$i-1], "(@t1_terms) AND ", $tree->[$i+1], "(@t2_terms)\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
474 for my $term1 (@t1_terms){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
475 for my $term2 (@t2_terms){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
476 #print STDERR "Expanding to $term1 and $term2\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
477 push @expanded_term, "$term1 and $term2";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
478 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
479 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
480 splice(@$tree, $i-1, 3, @expanded_term);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
481 $i--; # list has been shortened
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
482 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
483 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
484 # Should be only "OR" ops left
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
485 # Resolve any OR subtrees
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
486 for(my $i = 0; $i <= $#{$tree}; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
487 next if $tree->[$i] eq "or";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
488 push @or_queries, flatten_query($tree->[$i]); # otherwise recursive parse
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
489 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
490
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
491 return @or_queries;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
492 }