# HG changeset patch # User Yusuf Ali # Date 1427311409 21600 # Node ID 6411ca16916e0acb9a2c48a15754209ef0809754 initial commit diff -r 000000000000 -r 6411ca16916e AssociatePhenotypes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AssociatePhenotypes.xml Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,128 @@ + + + + based on the medical literature + echo 1.0.0 + associate_variant_phenotypes $__tool_data_path__ pheno $input_hgvs_table + ## Handle preselected gene list of interest + #if $preselectedGenesSource.source == "file": + $preselectedGenesSource.file_of_genenames + #else: + #set $glist = str($preselectedGenesSource.genename_list).replace("__cr____cn__", " or ") + "$glist" + #end if + + ## Handle human literature terms + #if $litQuerySource.source == "file": + $litQuerySource.file_of_phenotypes + #else: + #set $qlist = str($litQuerySource.phenotype_list).replace("__cr____cn__", " or ") + "$qlist" + #end if + + ## Handle human phenotype ontology query + ##if $hpQuerySource.source == "file": + ##$hpQuerySource.file_of_mpterms + ##else: + ##set $hplist = str($hpQuerySource.autocomplete_OLS_HP).replace(";", " or ") + ##"$hplist" + ##end if + + ## Handle mouse knockout query (Mammalian Phenotype Ontology) + #if $mpQuerySource.source == "file": + $mpQuerySource.file_of_mpterms + #else: + #set $mplist = str($mpQuerySource.autocomplete_OLS_MP).replace(";", " or ") + "$mplist" + #end if + + ## Handle gene ontology terms + #if $goQuerySource.source == "file": + $goQuerySource.file_of_goterms + #else: + #set $golist = str($goQuerySource.autocomplete_OLS_GO).replace(";", " or ") + "$golist" + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tools adds columns to an HGVS table that include all of the literature references from OMIM, PubMed, ClinVar, the Human Phenotype + Ontology, the Mouse Knockout Phenotypes, and Gene Ontology that match a given set of clinical phenotype query terms. A combined + probability of gene-phenotype association is calculated to help the user rank potentially causative genes for presumed genetic disorders. + + diff -r 000000000000 -r 6411ca16916e associate_variant_phenotypes --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/associate_variant_phenotypes Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,366 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Math::CDF qw(pchisq); # chi square for calculating Fisher's method of combining p-values +use File::Basename; +my $dirname = dirname(__FILE__); + +# configuration file stuff +my %config; +my $tool_data = shift @ARGV; +if(not -e "$tool_data/associate_phenotypes.loc" ){ + system("cp $dirname/tool-data/associate_phenotypes.loc $tool_data/associate_phenotypes.loc") >> 8 and die "Could not create config file: $!\n"; +} +open CONFIG, '<', "$tool_data/associate_phenotypes.loc"; +while(){ + next if $_ =~ /^#/; + (my $key, my $value) = split(/\s+/,$_); + $config{$key} = $value; +} +close CONFIG; +my $dbs_dir = $config{"dbs_dir"}; + +@ARGV == 6 or die "Usage: $0 \n"; + +my $final_confident_outfiles_prefix = shift @ARGV; +my $confident_input = shift @ARGV; +my $preselected_genes_file = shift @ARGV; +my $human_lit_file = shift @ARGV; +my $mouse_knockout_file = shift @ARGV; +my $gene_ontology_file = shift @ARGV; + +my @genes; +if(-e $preselected_genes_file){ + open(GENES, $preselected_genes_file) + or die "Cannot open $preselected_genes_file for reading: $!\n"; + while(){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + push @genes, $_; + } + close(GENES); +} +else{ + @genes = split / or /, $preselected_genes_file; +} +my %genes; +for my $g (@genes){ + $genes{lc($g)} = 1; +} + +my @human_lit_query; +if(-e $human_lit_file){ + open(HUMAN, $human_lit_file) + or die "Cannot open $human_lit_file for reading: $!\n"; + while(){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @human_lit_query, $_; + } + close(HUMAN); +} +else{ + @human_lit_query = split / or /, $human_lit_file; +} +my $human_lit_query = join(" or ", @human_lit_query, @genes); + +my @mouse_knockout_query; +if(-e $mouse_knockout_file){ + open(MOUSE, $mouse_knockout_file) + or die "Cannot open $mouse_knockout_file for reading: $!\n"; + while(){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @mouse_knockout_query, $_; + } + close(MOUSE); +} +else{ + @mouse_knockout_query = split / or /, $mouse_knockout_file; +} +my $mouse_knockout_query = join(" or ", @mouse_knockout_query); + +my @go_query; +if(-e $gene_ontology_file){ + open(GO, $gene_ontology_file) + or die "Cannot open $gene_ontology_file for reading: $!\n"; + while(){ + chomp; + next if /^#/ or not /\S/; + s/^\s+|\s+$//g; # get rid of trailing or leading spaces + s/\s+/ /g; # normalize any other whitespace + # to do: stem query terms? exclude stop words? + push @go_query, $_; + } +} +else{ + @go_query = split / or /, $gene_ontology_file; +} +my $go_query = join(" or ", @go_query); +close(GO); + +my @cmds; + +if($human_lit_query){ + # do pubmed first because it has to potentially download references from the internet, so better to do this with just a couple concurrent rather than a lot, which would stress the remote iHOP server + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/IHOP/ PubMed $confident_input - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_susceptibility_loci_pipe $dbs_dir/GWAS/gwascatalog.txt - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/OMIM/omim.txt. OMIM - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_index_gamma $dbs_dir/ClinVar/ClinVarFullRelease.xml. ClinVar - - '$human_lit_query'"; + push @cmds, "$dirname/filter_by_human_phenotype_ontology_pipe $dbs_dir/HPO - - '$human_lit_query'"; +} + +if($mouse_knockout_query or $human_lit_query){ + if($mouse_knockout_query){ + if($human_lit_query){ + $mouse_knockout_query .= " or $human_lit_query"; + } + } + else{ + $mouse_knockout_query = $human_lit_query; + } + if($human_lit_query){ + push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 - - '$mouse_knockout_query'"; + } + else{ + push @cmds, "$dirname/filter_by_mouse_knockout_pipe $dbs_dir/MGI/2013-03-15 $confident_input - '$mouse_knockout_query'" + } +} + +if($go_query or $human_lit_query){ + if($go_query){ + if(@human_lit_query){ + $go_query .= " or ".join(" or ", @human_lit_query); + } + } + else{ + $go_query = join(" or ", @human_lit_query); + } + if($mouse_knockout_query or $human_lit_query){ + push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA - - '$go_query'"; + } + else{ + push @cmds, "$dirname/associate_phenotypes/filter_by_gene_ontology_pipe $dbs_dir/GOA $confident_input - '$go_query'"; + } +} + +&print_final_output($final_confident_outfiles_prefix, @cmds); + +# Use Fisher's Method to combine p-values from various phenotype sources into a single score for ranking +# This is an okay method to use (rather than something more complicated like Brown's method), because our +# experience with real queries is that there is surprsingly little correlation (Spearman's rank or Kendall's tau) between +# the p-values for different sources (primary or curated secondary). +sub print_final_output{ + my ($final_output_prefix, @cmds) = @_; + + my $cmd = join("|", @cmds). "|"; # pipe output so we read the stream in the handle below + open(ORIG, $cmd) + or die "Could not run '$cmd': $!\n"; + my $header = ; + chomp $header; + my @orig_header = split /\t/, $header; + my ($chr_column, $pos_column, $gene_column, $hgvs_aa_column, $maf_column, $srcs_column, @pvalue_columns, @pheno_match_columns); + for(my $i = 0; $i <= $#orig_header; $i++){ + if($orig_header[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($orig_header[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($orig_header[$i] eq "Gene Name"){ + $gene_column = $i; + } + elsif($orig_header[$i] eq "Protein HGVS"){ + $hgvs_aa_column = $i; + } + elsif($orig_header[$i] eq "Pop. freq."){ + $maf_column = $i; + } + elsif($orig_header[$i] eq "Sources"){ + $srcs_column = $i; + } + elsif($orig_header[$i] =~ /p-value/){ # columns of pheno association with a stat + push @pvalue_columns, $i; + } + elsif($orig_header[$i] =~ /\(matching/){ + push @pheno_match_columns, $i; + } + } + if(not defined $chr_column){ + die "Could not find the 'Chr' column in the header, aborting ($header)\n"; + } + elsif(not defined $pos_column){ + die "Could not find the 'DNA From' column in the header, aborting ($header)\n"; + } + elsif(not defined $hgvs_aa_column){ + die "Could not find the 'Protein HGVS' column in the header, aborting ($header)\n"; + } + elsif(not defined $maf_column){ + die "Could not find the 'Pop. freq.' column in the header, aborting ($header)\n"; + } + # Sources is optional + + # all other headers from other output files generated will be appended to the original ones + my @final_header = (@orig_header, "Combined phenotype relevance P-value"); + if(@genes){ + push @final_header, "Targeted Gene?"; + } + my %lines; # chr -> position -> [dataline1, dataline2, ...] + my %source; # no. lines per variant source + while(){ + chomp; + next unless /\S/; # ignore blank lines + my @F = split /\t/, $_, -1; # keep trailing blank fields + my $chr = $F[$chr_column]; + $chr =~ s/^chr//; # helps for sorting purposes + my $pos = $F[$pos_column]; + $pos =~ s/-.*$//; # CNVs have a range + $lines{$chr} = {} if not exists $lines{$F[$chr_column]}; + $lines{$chr}->{$pos} = [] if not exists $lines{$F[$chr_column]}->{$pos}; + my @final_dataline = @F; # fields that are the same in all files since they were in the original + for(my $i = 0; $i < $#final_dataline; $i++){ + $final_dataline[$i] = "" if not defined $final_dataline[$i]; + } + # Create aggregate phenotype relevance score using Fisher's method + # A combined p-value for k p-values (P1...Pk) is calculated using a chi-square value (with 2k degrees of freedom) derived by -2*sum(ln(Pi), i=1..k) + my $chi_sq = 0; + my $num_pvalues = 0; + my $last_pvalue = 1; + for my $pvalue_index (@pvalue_columns){ + next if $F[$pvalue_index] eq ""; + $last_pvalue = $F[$pvalue_index]; + $F[$pvalue_index] = 0.00001 if not $F[$pvalue_index]; # avoids log(0) issue + $num_pvalues++; + $chi_sq += log($F[$pvalue_index]); + } + my $fisher_pvalue = 1; + if($num_pvalues > 1){ + $chi_sq *= -2; + my $p = pchisq($chi_sq, 2*scalar(@pvalue_columns)); + if(not defined $p){ + print STDERR "($num_pvalues) No X2 test value for $chi_sq ("; + for my $pvalue_index (@pvalue_columns){ + if($F[$pvalue_index] eq ""){print STDERR "NA "} + else{print STDERR $F[$pvalue_index], " "} + } + print STDERR ")\n$_\n"; + } + $fisher_pvalue = 1-$p; + } + elsif($num_pvalues == 1){ + $fisher_pvalue = $last_pvalue; # no multiple testing correction + } + else{ + for my $match_column (@pheno_match_columns){ + next if $F[$match_column] eq ""; # give a token amount of positive score to ontology term matches + for my $match (split /\/\/|;/, $F[$match_column]){ + last if $fisher_pvalue <= 0.001; # only make better if not realy close to zero anyway + $fisher_pvalue -= 0.001; + } + } + } + push @final_dataline, abs($fisher_pvalue); + if(@genes){ + push @final_dataline, (grep({exists $genes{$_}} split(/; /, lc($F[$gene_column]))) ? "yes" : "no"); + } + push @{$lines{$chr}->{$pos}}, \@final_dataline; + + next unless defined $srcs_column and $F[$srcs_column] =~ /(?:^|\+| )(\S+?)(?=;|$)/; + $source{$1}++; + } + + my @outfiles = ("$final_output_prefix.novel.hgvs.txt", "$final_output_prefix.very_rare.hgvs.txt", "$final_output_prefix.rare.hgvs.txt", "$final_output_prefix.common.hgvs.txt"); + open(OUT_NOVEL, ">$outfiles[0]") + or die "Cannot open $outfiles[0] for writing: $!\n"; + open(OUT_VERY_RARE, ">$outfiles[1]") + or die "Cannot open $outfiles[1] for writing: $!\n"; + open(OUT_RARE, ">$outfiles[2]") + or die "Cannot open $outfiles[2] for writing: $!\n"; + open(OUT_COMMON, ">$outfiles[3]") + or die "Cannot open $outfiles[3] for writing: $!\n"; + print OUT_NOVEL join("\t", @final_header), "\n"; + print OUT_VERY_RARE join("\t", @final_header), "\n"; + print OUT_RARE join("\t", @final_header), "\n"; + print OUT_COMMON join("\t", @final_header), "\n"; + my @sorted_chrs = sort {$a =~ /^\d+$/ and $b =~ /^\d+$/ and $a <=> $b or $a cmp $b} keys %lines; + for my $chr (@sorted_chrs){ + for my $pos (sort {$a <=> $b} keys %{$lines{$chr}}){ + my $datalines_ref = $lines{$chr}->{$pos}; + # The following sorting puts all protein coding effect for a variant before non-coding ones + my @sorted_dataline_refs = sort {$a ne "NA" and $b ne "NA" and $a->[$hgvs_aa_column] cmp $a->[$hgvs_aa_column] or $b cmp $a} @$datalines_ref; + for my $dataline_ref (@sorted_dataline_refs){ + next unless defined $dataline_ref; + my $maf = $dataline_ref->[$maf_column]; + my $tot_line_length = 0; + for(my $i = 0; $i < $#{$dataline_ref}; $i++){ + if(not defined $dataline_ref->[$i]){ + $dataline_ref->[$i] = ""; # so we don't get crappy warnings of undefined values + } + else{ + $tot_line_length += length($dataline_ref->[$i]); + } + $tot_line_length++; # the tab + } + if($tot_line_length > 32000){ # Excel limit of 32767 characters in a cell. Also, undocumented bug that entire import line cannot exceeed this length. + # If we don't truncate, the rest of the line (including entire contents of cells to the right) are unceremoniously dumped. + # Note that personal experience has shown that the limit is actually a bit below this, so rounding down to the nearest 1000 for safety + my $overage = $tot_line_length - 32000; + my $sum_of_large_cells = 0; + my $num_large_cells = 0; + for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # remove contents from the largest cells + if(length($dataline_ref->[$i]) > 3200){ + $sum_of_large_cells += length($dataline_ref->[$i]); # all cells that are at least 10% of the max + $num_large_cells++; + } + } + my $cell_max_alloc = int((32000-($tot_line_length-$sum_of_large_cells))/$num_large_cells); + for(my $i = 0; $i <= $#{$dataline_ref}; $i++){ # truncate the bigger than average ones + if(length($dataline_ref->[$i]) > $cell_max_alloc){ + $dataline_ref->[$i] = substr($dataline_ref->[$i], 0, $cell_max_alloc-37)."[...remainder truncated for length]"; + } + } + } + if($maf eq "NA"){ + print OUT_NOVEL join("\t", @$dataline_ref), "\n"; + } + if($maf eq "NA" or $maf < 0.005){ + print OUT_VERY_RARE join("\t", @$dataline_ref), "\n"; + } + if($maf eq "NA" or $maf < 0.05){ + print OUT_RARE join("\t", @$dataline_ref), "\n"; + } + print OUT_COMMON join("\t", @$dataline_ref), "\n"; + } + } + } + close(OUT_NOVEL); + close(OUT_VERY_RARE); + close(OUT_RARE); + close(OUT_COMMON); + + # Print per-source tables (e.g. for each patient in a cohort) + for my $src (keys %source){ + for my $outfile (@outfiles){ + open(IN, $outfile) + or die "cannot open $outfile for reading: $!\n"; + my $src_outfile = $outfile; + $src_outfile =~ s/$final_output_prefix/$final_output_prefix-$src/; + open(OUT, ">$src_outfile") + or die "Cannot open $src_outfile for writing: $!\n"; + print OUT scalar(); # header line + while(){ + print OUT $_ if /(?:^|\+| )($src)(?=;|$)/; + } + close(OUT); + } + } +} + diff -r 000000000000 -r 6411ca16916e filter_by_gene_ontology_pipe --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_gene_ontology_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,196 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 4 or die "Usage: $0 [-q(uiet)] \n", + "Where query has the format \"this or that\", \"this and that\", etc.\n", + "GOA and OBO files must be found in the GO data dir\n"; + +my $go_dir = shift @ARGV; +my $obo_file = "$go_dir/gene_ontology.1_2.obo"; +my $hgvs_file = shift @ARGV; +my $out_file = shift @ARGV; +my $query = shift @ARGV; + +# Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems +my %problematic_terms = ("GO:0033647" => "host intracellular organelle", + "GO:0006996" => "organelle organization", + "GO:0043226" => "organelle", + "GO:0043227" => "membrane-bounded organelle", + "GO:0043229" => "intracellular organelle", + "GO:0043231" => "intracellular membrane-bounded organelle", + "GO:0044384" => "host cell outer membrane", + "GO:0044422" => "organelle part", + "GO:0044446" => "intracellular organelle part", + "GO:0045202" => "synapse", + "GO:0033648" => "host intracellular membrane-bounded organelle"); + +#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals + +# convert the query to a regex +my $orig_query = $query; +my $and_query = 0; +$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; +if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ + $and_query = 1; +} +$query =~ s/\s+or\s+/|/gi; +$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name +#print STDERR "Query regex for GO is $query\n" unless $quiet; + +open(OBO, $obo_file) + or die "Cannot open $obo_file for reading: $!\n"; +my %matched_go_ids; +my %go_id2subtypes; +my %go_id2name; +my $record_count; +$/ = "\n[Term]\n"; +; # chuck header +while(){ + next unless /^id:\s*(GO:\d+)/s; + my $id = $1; + next unless /\nname:\s*(.+?)\s*\n/s; + my $name = $1; + $go_id2name{$id} = $name; + $record_count++; + while(/\nis_a:\s*(GO:\d+)/g){ + my $parent_id = $1; + $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id}; + push @{$go_id2subtypes{$parent_id}}, $id; + } + if(exists $problematic_terms{$id}){ + if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_go_ids{$id}){ + $matched_go_ids{$id} = $match; + } + elsif($matched_go_ids{$id} !~ /$match/){ + $matched_go_ids{$id} .= "; $match"; + } + } + } + elsif(/\b($query)/o){ + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_go_ids{$id}){ + $matched_go_ids{$id} = $match; + } + elsif($matched_go_ids{$id} !~ /$match/){ + $matched_go_ids{$id} .= "; $match"; + } + #print STDERR "Found match $match for $_\n"; + } +} +close(OBO); +#print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n"; + +open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; + +# Implements term subsumption +my @matched_go_ids = keys %matched_go_ids; +for(my $i = 0; $i <= $#matched_go_ids; $i++){ + my $go_id = $matched_go_ids[$i]; + next unless exists $go_id2subtypes{$go_id}; + for my $sub_type_id (@{$go_id2subtypes{$go_id}}){ + if(not exists $matched_go_ids{$sub_type_id}){ + $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id}; + push @matched_go_ids, $sub_type_id; + } + } +} + +$/="\n"; # record separator +my %gene2go_ids; +opendir(GOADIR, $go_dir) + or die "Cannot read directory $go_dir: $!\n"; +while($_ = readdir(GOADIR)){ + next if /^\./; # hidden + next unless /\.goa$/; # not a goa formatted file + my $goa_file = $_; + open(GOA, "$go_dir/$goa_file") + or die "Cannot open $go_dir/$goa_file for reading: $!\n"; + #print STDERR "Processing file $goa_file\n"; + # example line: + # UniProtKB A0ASJ9 GO:0001664 GO_REF:0000033 ISS PANTHER:PTN000026392 F G protein alpha subunit AgGq6 A0ASJ9_ANOGA protein taxon:7165 20110125 RefGenome + while(){ + next if /^!/; # comment + chomp; + my @F = split /\t/, $_; + next unless $F[2] and $#F > 3; # does it have the gene name and go id fields? + my $genename = uc($F[2]); # standardize gene names to upper case + $genename =~ s/-//g; + my $go_id = $F[4]; + $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename}; + push @{$gene2go_ids{$genename}}, $go_id; + } + close(GOA); +} +close(GOADIR); +#print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n"; + +# remove genes if they don't have a matching go term +for my $genename (keys %gene2go_ids){ + my $keep = 0; + for my $go_id (@{$gene2go_ids{$genename}}){ + if(exists $matched_go_ids{$go_id}){ + $keep = 1; + last; + } + } + delete $gene2go_ids{$genename} unless $keep; +} +#print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet; + +$/ = "\n"; # one line at, a time from the HGVS file please! +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +my $header = ; +chomp $header; +my @header_columns = split /\t/, $header; +my $gene_name_column; +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "Gene Name"){ + $gene_name_column = $i; + } +} +if(not defined $gene_name_column){ + die "Could not find 'Gene Name' column in the input header, aborting\n"; +} + +print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n"; +# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target go term list +while(){ + chomp; + my @F = split /\t/, $_, -1; + my (@target_gos, @other_gos); + for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ + next unless exists $gene2go_ids{$gene_name}; + for my $id (@{$gene2go_ids{$gene_name}}){ + if(exists $matched_go_ids{$id}){ + push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})"; + } + else{ + push @other_gos, $go_id2name{$id}; + } + } + } + if(@target_gos){ + my (%t,%o); + # print unique terms + print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n"; + } + else{ + print OUT join("\t", @F, "", ""), "\n"; + } +} diff -r 000000000000 -r 6411ca16916e filter_by_human_phenotype_ontology_pipe --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_human_phenotype_ontology_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,210 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 4 or die "Usage: $0 [-q(uiet)] \n", + "Where query has the format \"this or that\", \"this and that\", etc.\n", + "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n"; + +my $hpo_dir = shift @ARGV; +my $obo_file = "$hpo_dir/human-phenotype-ontology.obo"; +my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt"; +my $hgvs_file = shift @ARGV; +my $out_file = shift @ARGV; +my $query = shift @ARGV; + +# Below is a list of ontology terms with definitions including hyponyms and meronyms, so only match text on title so as not to get too many false positives +my %problematic = qw(HP:0003271 Visceromegaly + HP:0000246 Sinusitis); +# Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO +my %self_referencing = ( + "HP:0000118" => "Phenotypic abnormality", + "HP:0000455" => "Broad nasal tip", + "HP:0000968" => "Ectodermal dysplasia", + "HP:0002652" => "Skeletal dysplasia", + "HP:0003812" => "Phenotypic variability", + "HP:0003828" => "Variable expressivity", + "HP:0003829" => "Incomplete penetrance", + "HP:0004472" => "Mandibular hyperostosis", + "HP:0004495" => "Thin anteverted nares", + "HP:0005286" => "Hypoplastic, notched nares", + "HP:0005321" => "Mandibulofacial dysostosis", + "HP:0005871" => "Metaphyseal chondrodysplasia", + "HP:0007589" => "Aplasia cutis congenita on trunk or limbs", + "HP:0007819" => "Presenile cataracts"); + +# Ignore metadata field words, so we can properly match gene names like HPO :-) +my %stop_words = ("name:" => 1, + "HPO:" => 1, + "alt_id:" => 1, + "def:" => 1, + "synonym:" => 1, + "EXACT" => 1, + "xref:" => 1, + "UMLS:" => 1, + "is_a:" => 1, + "created_by:" => 1, + "comment:" => 1, + "creation_date:" => 1); + +# convert the query to a regex +my $orig_query = $query; +my $and_query = 0; +$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; +if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ + $and_query = 1; +} +$query =~ s/\s+or\s+/|/gi; +$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name +#print STDERR "Query regex is $query\n" unless $quiet; + + +open(OBO, $obo_file) + or die "Cannot open $obo_file for reading: $!\n"; +my %matched_pheno_ids; +my %pheno_id2subtypes; +my %pheno_id2name; +my $record_count; +$/ = "\n[Term]\n"; +; # chuck header +while(){ + next unless /^id:\s*(HP:\d+)/s; + my $id = $1; + next unless /\nname:\s*(.+?)\s*\n/s; + my $name = $1; + $pheno_id2name{$id} = $name; + $record_count++; + while(/\nis_a:\s*(HP:\d+)/g){ + my $parent_id = $1; + $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; + push @{$pheno_id2subtypes{$parent_id}}, $id; + } + s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg; + if(exists $problematic{$id}){ # for overmatching terms due to their descriptions (hyponyms and meronyms included in + # parent entry), only match the title to not generate too many false poositives + while($name =~ /\b($query)(\S*?:?)/go){ + next if exists $stop_words{$1.$2}; + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/\s{2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /$match/){ + $matched_pheno_ids{$id} .= "; $match"; + } + } + } + else{ # normally, match anywhere in the entry + while(/\b($query)(\S*?:?)/go){ + next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id}; + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/\s{2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){ + $matched_pheno_ids{$id} .= "; $match"; + } + #print STDERR "Match $match for $_\n"; + } + } +} +close(OBO); +#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; + + +# Implements term subsumption +my @matched_pheno_ids = keys %matched_pheno_ids; +for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ + my $pheno_id = $matched_pheno_ids[$i]; + next unless exists $pheno_id2subtypes{$pheno_id}; + for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ + if(not exists $matched_pheno_ids{$sub_type_id}){ + $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; + push @matched_pheno_ids, $sub_type_id; + } + } +} + +$/="\n"; # record separator +my %gene2pheno_ids; +# Format: entrez-gene-identrez-gene-symbolHPO-Term-NameHPO-Term-ID +open(PHENO, $gene_pheno_file) + or die "Cannot open $gene_pheno_file for reading: $!\n"; +while(){ + chomp; + my @F = split /\t/, $_; + next unless $#F > 2; # does it have the phenotype id field? + my $gene = $F[1]; + my $pheno_id = $F[3]; + $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene}; + push @{$gene2pheno_ids{$gene}}, $pheno_id; +} + +# remove genes if they don't have a matching phenotype +for my $gene (keys %gene2pheno_ids){ + my $keep = 0; + for my $pheno_id (@{$gene2pheno_ids{$gene}}){ + if(exists $matched_pheno_ids{$pheno_id}){ + $keep = 1; + last; + } + } + delete $gene2pheno_ids{$gene} unless $keep; +} +#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet; + +$/ = "\n"; # one line at, a time from the HGVS file please! +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +my $header = ; +chomp $header; +my @header_columns = split /\t/, $header; +my $gene_name_column; +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "Gene Name"){ + $gene_name_column = $i; + } +} +if(not defined $gene_name_column){ + die "Could not find 'Gene Name' column in the input header, aborting\n"; +} +open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; +print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n"; + +# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list +while(){ + chomp; + my @F = split /\t/, $_, -1; + my (@target_phenos, @other_phenos); + for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ + next unless exists $gene2pheno_ids{$gene_name}; + for my $id (@{$gene2pheno_ids{$gene_name}}){ + next unless exists $pheno_id2name{$id}; + if(exists $matched_pheno_ids{$id}){ + push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; + } + else{ + push @other_phenos, $pheno_id2name{$id}; + } + } + } + if(@target_phenos){ + print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n"; + } + else{ + print OUT join("\t", @F, "", ""), "\n"; + } +} +close(OUT); +close(HGVS); diff -r 000000000000 -r 6411ca16916e filter_by_index_gamma --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_index_gamma Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,492 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use DB_File; +use Parse::BooleanLogic; +use Math::CDF qw(pgamma qgamma); # relevance score -> gamma p-value +use PDL qw(pdl); +use PDL::Stats::Distr qw(mme_gamma); # gamma dist parameter estimates +use vars qw($parser %cached_sentences %sentence_index); + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 5 or die "Usage: $0 [-q(uiet)] \nWhere query has the format \"this or that\", \"this and that\", etc.\n"; + +my $signal_p = 0.95; # signal is top 5% of scores +my $index_filename_base = shift @ARGV; +my $db_name = shift @ARGV; +my $hgvs_file = shift @ARGV; +my $out_file = shift @ARGV; +my $orig_query = shift @ARGV; + +$parser = new Parse::BooleanLogic(operators => ['and', 'or']); +my $query_tree = $parser->as_array($orig_query, error_cb => sub {die "Could not parse query: @_\n"}); +# For simplicity, turn the tree into a base set of or statements (which means expanding "A and (B or C)" into "A and B or A and C") a.k.a. "sum of products/minterms" +my @query_terms = flatten_query($query_tree); + +my $df_index_filename = $index_filename_base."df_index"; +my %df_index; +my $df_index_handle = tie %df_index, "DB_File", $df_index_filename, O_RDONLY, 0400, $DB_BTREE + or die "Cannot open $df_index_filename: $!\n"; +my $gene_record_count = $df_index{"__DOC_COUNT__"}; + +my $sentence_index_filename = $index_filename_base."sentence_index"; +my $sentence_index_handle = tie %sentence_index, "DB_File", $sentence_index_filename, O_RDONLY, 0400, $DB_HASH + or die "Cannot open $sentence_index_filename: $!\n"; + +# Get the list of gene symbols we'll need +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +my $header = ; +chomp $header; +my @header_columns = split /\t/, $header; +my ($gene_name_column, $chr_column, $from_column, $to_column); +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "Gene Name"){ + $gene_name_column = $i; + } + elsif($header_columns[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($header_columns[$i] eq "DNA From"){ + $from_column = $i; + } + elsif($header_columns[$i] eq "DNA To"){ + $to_column = $i; + } +} +my $blank_query = not @query_terms; +# Special case of empty query means print all info for variant ranges listed in the input HGVS file (assuming the DB was indexed to include chr:pos keys) +if($blank_query){ + #print STDERR "Running blank query\n"; + if(not defined $chr_column){ + die "Could not find 'Chr' column in the input header, aborting\n"; + } + if(not defined $from_column){ + die "Could not find 'DNA From' column in the input header, aborting\n"; + } + if(not defined $to_column){ + die "Could not find 'DNA To' column in the input header, aborting\n"; + } + # Build the list of locations that will need to be searched in the index + + open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; + print OUT $header, "\t$db_name Text Matches\n"; + + while(){ + chomp; + my @F = split /\t/, $_, -1; + my @pos_data; + for my $pos ($F[$from_column]..$F[$to_column]){ # for each position in the range + my $pos_match_data = fetch_sentence("$F[$chr_column]:$pos", -1); # fetch all data for this position + push @pos_data, "*$F[$chr_column]:$pos* ".$pos_match_data if defined $pos_match_data; + } + print OUT join("\t", @F, join(" // ", @pos_data)),"\n"; + } + close(OUT); + exit; +} +elsif(not defined $gene_name_column){ + die "Could not find 'Gene Name' column in the input header, aborting\n"; +} +#print STDERR "Query terms: " , scalar(@query_terms), "\n"; +my %gene_to_query_match_ranges; +# Determine the set of genes that might match the query, based on the word index +for my $query_term (@query_terms){ + #print STDERR "Query term $query_term\n"; + my %doc_hits; # how many needed words match the document? + my $contiguous = 1; #by default multiword queries must be contiguous + # Unless it's an AND query + if($query_term =~ s/ and / /g){ + $contiguous = 0; + } + + my @words = split /\s+/, $query_term; # can be multi-word term like "mental retardation" + for(my $i = 0; $i <= $#words; $i++){ + my $word = mc($words[$i]); # can be a stem word, like hypoton + #print STDERR "Checking word $word..."; + if($i == 0){ + my $first_word_docs = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem + #print STDERR scalar(keys %$first_word_docs), " documents found\n"; + for my $doc (keys %$first_word_docs){ + $doc_hits{$doc} = $first_word_docs->{$doc}; # populate initial hit list that'll be whittled down in subsequent outer loops of multiword phrase members + } + next; + } + my @candidate_docs = keys %doc_hits; + last if not @candidate_docs; # short circuit searches guaranteed to fail + + # each additional word must directly follow an existing match + my $word_doc_offsets_ref = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem + #print STDERR scalar(keys %$word_doc_offsets_ref), " documents found\n"; + for my $doc (@candidate_docs){ + my $num_matches = 0; + if(not exists $word_doc_offsets_ref->{$doc}){ # required word missing, eliminate doc from consideration + delete $doc_hits{$doc}; + next; + } + # see if any of the instances of the additional words directly follow the last word we successfully matched + my $so_far_matches_ref = $doc_hits{$doc}; + my $next_word_matches_ref = $word_doc_offsets_ref->{$doc}; + for (my $j=0; $j <= $#{$so_far_matches_ref}; $j++){ + my $existing_match_extended = 0; + next unless defined $so_far_matches_ref->[$j]->[2]; # every once in a while there is no article id parsed + for (my $k=0; $k <= $#{$next_word_matches_ref}; $k++){ + # Same article? + next unless defined $next_word_matches_ref->[$k]->[2] and $next_word_matches_ref->[$k]->[2] eq $so_far_matches_ref->[$j]->[2]; + if(not $contiguous){ + $so_far_matches_ref->[$j]->[4] .= " AND ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too + if(ref $so_far_matches_ref->[$j]->[3] ne "ARRAY"){ # match does not yet span multiple sentences + last if $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; # same sentence + $so_far_matches_ref->[$j]->[3] = [$so_far_matches_ref->[$j]->[3], $next_word_matches_ref->[$k]->[3]]; # change from scalar to array (of sentence numbers) + } + elsif(not grep {$_ eq $next_word_matches_ref->[$k]->[3]} @{$so_far_matches_ref->[$j]->[3]}){ + push @{$so_far_matches_ref->[$j]->[3]}, $next_word_matches_ref->[$k]->[3]; # add top spanning sentences list of not already there + } + } + # else contiguous word occurences required. + # Same sentence? + next unless $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; + + my $space_between_match_words = $next_word_matches_ref->[$k]->[0] - $so_far_matches_ref->[$j]->[1]; + if($space_between_match_words <= 2){ + $existing_match_extended = 1; + $so_far_matches_ref->[$j]->[1] = $next_word_matches_ref->[$k]->[1]; # move the match cursor to include the new extending word + $so_far_matches_ref->[$j]->[4] .= " ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too + last; + } + elsif($space_between_match_words > 2){ # more than two typographical symbols between words, consider non-continuous + last; # since the offsets are in order, any further k would only yield a larger spacing, so shortcircuit + } + } + if(not $existing_match_extended){ + splice(@$so_far_matches_ref, $j, 1); + $j--; + } + else{ + $num_matches++; + } + } + if(not $num_matches){ + delete $doc_hits{$doc}; + } + } + } + # the only keys that get to this point should be those that match all terms + for my $doc (keys %doc_hits){ + $gene_to_query_match_ranges{$doc} = [] if not exists $gene_to_query_match_ranges{$doc}; + push @{$gene_to_query_match_ranges{$doc}}, [$query_term, @{$doc_hits{$doc}}]; + } +} + +my @matched_genes = keys %gene_to_query_match_ranges; +#print STDERR "Found ", scalar(@matched_genes), "/$gene_record_count records in cached iHOP matching the query\n" unless $quiet; +my %query_gene_counts; +my %ntf; +for my $gene (keys %gene_to_query_match_ranges){ + my $max_doc_word_count = $df_index{"__DOC_MAX_WC_$gene"}; + for my $count_record (@{$gene_to_query_match_ranges{$gene}}){ + my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record; + # next if $query_term eq $gene; # slightly controversial? exclude references to genes from the score if the gene is the record being talked about (obviously it will be highly scored) + # allows us to find first degree interactors (i.e. points for "A interacts with B", in the record describing A) without creating crazy high score for doc describing gene B if B was in the original query without any phenotype query terms + $query_gene_counts{$query_term}++; + + $ntf{$gene} = {} unless exists $ntf{$gene}; + # atypical use of log in order to weigh heavy use of a common term less than occasional use of a rare term + $ntf{$gene}->{$query_term} = log($#query_term_match_ranges_in_this_gene+2)/log($max_doc_word_count+1); + } + #print STDERR "Doc max word count is $max_doc_word_count for $gene, ntf keys = ", keys %{$ntf{$gene}}, "\n"; +} + +my %idf; +for my $query_term (@query_terms){ # convert %idf values from documents-with-the-query-term-count to actual IDF + next unless exists $query_gene_counts{$query_term}; # query not in the document collection + $idf{$query_term} = log($gene_record_count/$query_gene_counts{$query_term}); + #print STDERR "$query_term IDF is $idf{$query_term}\n"; +} + +# Create a relevance score using a normalized term frequency - inverse document frequency summation +my %relevance_score; +my %matched_query_terms; +for my $gene_symbol (keys %gene_to_query_match_ranges){ + my $relevance_score = 0; + # Hmm, take average, sum or max of TF-IDFs? + my $max_query_score = 0; + my @matched_query_terms; + my $query_score = 0; + for (my $i = 0; $i <= $#query_terms; $i++){ + my $query_term = $query_terms[$i]; + next unless exists $idf{$query_term}; + next unless exists $ntf{$gene_symbol}->{$query_term}; + $query_score += $ntf{$gene_symbol}->{$query_term}*$idf{$query_term}; + push @matched_query_terms, $query_term; + $query_score *= 1-$i/scalar(@query_terms)/2 if scalar(@query_terms) > 2;# adjust the query score so the first terms are weighted more heavily if a bunch of terms are being searched + $max_query_score = $query_score if $query_score > $max_query_score; + $relevance_score += $query_score; + } + # this square root trick will not affect the score of a single term query, but will penalize a high total score that is comprised of a bunch of low value individual term scores) + $relevance_score{$gene_symbol} = sqrt($relevance_score*$max_query_score); + #print STDERR "Relevance score for $gene_symbol is $relevance_score{$gene_symbol}\n"; + $matched_query_terms{$gene_symbol} = \@matched_query_terms; +} + +# Characterize relevance score as a gamma statistical distribution and convert to probability +my $max_relevance_score = 0; +for my $relevance_score (values %relevance_score){ + $max_relevance_score = $relevance_score if $relevance_score > $max_relevance_score; +} +# Remove top end scores as signal, characterize the rest as noise. +# Iterative estimation of gamma parameters and removing data within range where CDF>99% +my $noise_data = pdl(values %relevance_score); +my ($shape, $scale) = $noise_data->mme_gamma(); +#print STDERR "Initial gamma distribution estimates: $shape, $scale (max observation $max_relevance_score)\n"; +my $signal_cutoff = qgamma($signal_p, $shape, 1/$scale); +my @noise_data; +for my $gene_symbol (keys %relevance_score){ + my $score = $relevance_score{$gene_symbol}; + push @noise_data, $score if $score < $signal_cutoff; +} +$noise_data = pdl(@noise_data); +($shape, $scale) = $noise_data->mme_gamma(); +#print STDERR "Revised gamma distribution estimates (noise estimate at $signal_cutoff (CDF $signal_p)): $shape, $scale\n"; +# Convert scores to probabilities +for my $gene_symbol (keys %relevance_score){ + $relevance_score{$gene_symbol} = 1-pgamma($relevance_score{$gene_symbol}, $shape, 1/$scale); +} + +#TODO: create summary stats for each query term so the user gets an idea of each's contribution? + +my %pubmed_matches; +for my $gene_symbol (keys %gene_to_query_match_ranges){ + my $query_match_ranges_ref = $gene_to_query_match_ranges{$gene_symbol}; + my %matching_sentences; + for my $count_record (@$query_match_ranges_ref){ + my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record; + for my $occ_info (@query_term_match_ranges_in_this_gene){ + my $id = $occ_info->[2]; + my $sentence_number = $occ_info->[3]; + my $query_match_word = $occ_info->[4]; + # Fetch the preparsed sentence from the sentence index based on id and sentence number + # Will automatically *HIGHLIGHT* the query terms fetched in the sentence over the course of this script + if(ref $sentence_number eq "ARRAY"){ # match spans multiple sentences + for my $s (@$sentence_number){ + for my $word (split / AND /, $query_match_word){ + #print STDERR "Highlighting $word in $id #$s for query term $query_term (multisentence match)\n"; + $matching_sentences{fetch_sentence_key($id, $s, $word)}++; + } + } + } + else{ # single sentence match + #print STDERR "Highlighting $query_match_word in $id #$sentence_number for query term $query_term\n"; + $matching_sentences{fetch_sentence_key($id, $sentence_number, $query_match_word)}++; + } + } + } + $gene_symbol =~ s/_/\//; # didn't have a forward slash in a gene name for disk caching purposes + if(keys %matching_sentences){ + $pubmed_matches{$gene_symbol} = [] unless exists $pubmed_matches{$gene_symbol}; + for my $new_match_ref (keys %matching_sentences){ + push @{$pubmed_matches{$gene_symbol}}, $new_match_ref unless grep {$_ eq $new_match_ref} @{$pubmed_matches{$gene_symbol}}; # only put in new sentences, no need to dup + } + } +} + +$orig_query =~ s/\s+/ /; # normalized whitespace +$orig_query =~ s/ and / and /i; # lc() +my @orig_query_terms = split /\s+or\s+/, $orig_query; + +open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; +my $new_header = $header; +$new_header .= "\t$db_name p-value (log normalized TF-IDF score, gamma dist)\t$db_name Matching Terms ($orig_query)\t$db_name Text Matches"; +print OUT $new_header, "\n"; + +# Check if any of the variants in the annotated HGVS table are in genes from the OMIM match list +while(){ + chomp; + my @F = split /\t/, $_, -1; + # order the ids from highest number of sentence matches to lowest, from highest ranked term to least + my (%id2match_count, %id2sentences); + my @matched_genes; + my $relevance_score_final = 1; + my @matched_query_terms; + for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ + next unless exists $pubmed_matches{$gene_name}; + push @matched_genes, $gene_name; + for my $sentence_ref (@{$pubmed_matches{$gene_name}}){ # 0 == always fetch the title which is stored in sentence index 0 + my $pubmed_record = fetch_sentence($sentence_ref); + $id2match_count{$pubmed_record->[0]}++; # key = id + if(not exists $id2sentences{$pubmed_record->[0]}){ + $id2sentences{$pubmed_record->[0]} = {}; + my $title_record = fetch_sentence(fetch_sentence_key($pubmed_record->[0], 0, "")); + next unless $title_record->[0]; + print STDERR "No $index_filename_base sentence number for ", $title_record->[0], "\n" if not defined $title_record->[1]; + print STDERR "No $index_filename_base sentence text for ", $title_record->[0], " sentence #", $title_record->[1], "\n" if not defined $title_record->[2]; + $id2sentences{$title_record->[0]}->{$title_record->[2]} = $title_record->[1]; + } + # Only print sentences that match a query term other than the gene name for the record key, if that gene name is part of the query + my $non_self_query_ref = 0; + while($pubmed_record->[2] =~ /\*(.+?)\*/g){ + if($1 ne $gene_name){ + $non_self_query_ref = 1; + last; + } + } + #print STDERR "rejected $gene_name self-only sentence ",$pubmed_record->[2],"\n" unless $non_self_query_ref; + next unless $non_self_query_ref; + $id2sentences{$pubmed_record->[0]}->{$pubmed_record->[2]} = $pubmed_record->[1]; # value = sentence order within pubmed text + } + $relevance_score_final *= $relevance_score{$gene_name}; + push @matched_query_terms, @{$matched_query_terms{$gene_name}}; + } + + # If we get here, there were matches + my @ordered_ids = sort {$id2match_count{$b} <=> $id2match_count{$a}} keys %id2match_count; + + # print sentences in each id in order, with ellipsis if not contiguous + my %h; + print OUT join("\t", @F, ($relevance_score_final != 1 ? $relevance_score_final : ""), (@matched_query_terms ? join("; ", sort grep {not $h{$_}++} @matched_query_terms) : "")), "\t"; + my $first_record = 1; + for my $id (@ordered_ids){ + my $sentence2order = $id2sentences{$id}; + my @ordered_sentences = sort {$sentence2order->{$a} <=> $sentence2order->{$b}} keys %$sentence2order; + next if scalar(@ordered_sentences) == 1; # due to self-gene only referencing filter above, we may have no matching sentences in a record. Skip in this case. + if($first_record){ + $first_record = 0; + } + else{ + print OUT " // "; + } + my $title = shift(@ordered_sentences); + print OUT "$db_name $id",(defined $title ? " $title": ""),":"; # first sentence is always the record title + my $last_ordinal = 0; + for my $s (@ordered_sentences){ + if($last_ordinal and $sentence2order->{$s} != $last_ordinal+1){ + print OUT ".."; + } + print OUT " ",$s; + $last_ordinal = $sentence2order->{$s}; + } + } + print OUT "\n"; +} + +sub get_doc_offsets{ + my ($db_handle, $word_stem) = @_; + my %doc2offsets; + + my $is_uc = $word_stem =~ /^[A-Z0-9]+$/; + my $has_wildcard = $word_stem =~ s/\*$//; + my $value = 0; + my $cursor_key = $word_stem; + # retrieves the first + for(my $status = $db_handle->seq($cursor_key, $value, R_CURSOR); + $status == 0; + $status = $db_handle->seq($cursor_key, $value, R_NEXT)){ + if(CORE::index($cursor_key,$word_stem) != 0){ + last; # outside the records that have the requested stem now + } + for my $record (split /\n/s, $value){ + my ($doc, @occ_infos) = split /:/, $record; + $doc2offsets{$doc} = [] if not exists $doc2offsets{$doc}; + for my $occ_info (@occ_infos){ + my ($term_offset, $id, $sentence_number) = split /,/, $occ_info, -1; + # record start and end of word to facilitate partial key consecutive word matching algorithm used in this script + push @{$doc2offsets{$doc}}, [$term_offset, $term_offset+length($cursor_key), $id, $sentence_number, $cursor_key]; + } + } + last if $is_uc and not $has_wildcard; # only exact matches for upper case words like gene names + } + return \%doc2offsets; +} + +sub mc{ + if($_[0] =~ /^[A-Z][a-z]+$/){ + return lc($_[0]); # sentence case normalization to lower case for regular words + } + else{ + return $_[0]; # as-is for gene names, etc + } +} + +sub fetch_sentence_key{ + my ($id, $sentence_number, $query_term) = @_; + + $sentence_number = 0 if not defined $sentence_number; + return ":$sentence_number" if not $id; + my $key = "$id:$sentence_number"; + if(not exists $cached_sentences{$key}){ + my @sentences = split /\n/, $sentence_index{$id}; + $cached_sentences{$key} = $sentences[$sentence_number]; + } + $cached_sentences{$key} =~ s/\b\Q$query_term\E\b(?!\*)/"*".uc($query_term)."*"/ge unless $query_term eq ""; + #print STDERR "Highlighted $query_term in $cached_sentences{$key}\n" if $query_term =~ /cirrhosis/; + return $key; +} + +sub fetch_sentence{ + if(@_ == 1){ # from cache + return [split(/:/, $_[0]), $cached_sentences{$_[0]}]; + } + else{ # if more than one arg, DIRECT FROM index key as first arg, sentence # is second arg + return undef if not exists $sentence_index{$_[0]}; + my @sentences = split /\n/, $sentence_index{$_[0]}; + if($_[1] < 0){ # all sentences request + return join("; ", @sentences); + } + return $sentences[$_[1]]; + } +} + + +# boolean operator tree to flat expanded single depth "or" op query +sub flatten_query{ + my $tree = shift @_; + my @or_queries; + + # Base case: the tree is just a leaf (denoted by a hash reference). Return value of the operand it represents. + if(ref $tree eq "HASH"){ + return ($tree->{"operand"}); + } + + elsif(not ref $tree){ + return $tree; + } + + # Otherwise it's an operation array + if(ref $tree ne "ARRAY"){ + die "Could not parse $tree, logic error in the query parser\n"; + } + + # Deal with AND first since it has higher precedence + for (my $i = 1; $i < $#{$tree}; $i++){ + if($tree->[$i] eq "and"){ + my @expanded_term; + my @t1_terms = flatten_query($tree->[$i-1]); + my @t2_terms = flatten_query($tree->[$i+1]); + #print STDERR "need to expand ", $tree->[$i-1], "(@t1_terms) AND ", $tree->[$i+1], "(@t2_terms)\n"; + for my $term1 (@t1_terms){ + for my $term2 (@t2_terms){ + #print STDERR "Expanding to $term1 and $term2\n"; + push @expanded_term, "$term1 and $term2"; + } + } + splice(@$tree, $i-1, 3, @expanded_term); + $i--; # list has been shortened + } + } + # Should be only "OR" ops left + # Resolve any OR subtrees + for(my $i = 0; $i <= $#{$tree}; $i++){ + next if $tree->[$i] eq "or"; + push @or_queries, flatten_query($tree->[$i]); # otherwise recursive parse + } + + return @or_queries; +} diff -r 000000000000 -r 6411ca16916e filter_by_mouse_knockout_pipe --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_mouse_knockout_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,202 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 4 or die "Usage: $0 [-q(uiet)] \n", + "Where query has the format \"this or that\", \"this and that\", etc.\n", + "Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n"; + +my $mgi_dir = shift @ARGV; +my $obo_file = "$mgi_dir/MPheno_OBO.ontology"; +my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt"; +my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt"; +my $hgvs_file = shift @ARGV; +my $out_file = shift @ARGV; +my $query = shift @ARGV; + +#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals +my %problematic_terms = (); + +# convert the query to a regex +my $orig_query = $query; +my $and_query = 0; +$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; +if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ + $and_query = 1; +} +$query =~ s/\s+or\s+/|/gi; +$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name +#print STDERR "Query regex is $query\n" unless $quiet; + +open(OBO, $obo_file) + or die "Cannot open $obo_file for reading: $!\n"; +my %matched_pheno_ids; +my %pheno_id2subtypes; +my %pheno_id2name; +my $record_count; +$/ = "\n[Term]\n"; +; # chuck header +while(){ + next unless /^id:\s*(MP:\d+)/s; + my $id = $1; + next unless /\nname:\s*(.+?)\s*\n/s; + my $name = $1; + $pheno_id2name{$id} = $name; + $record_count++; + while(/\nis_a:\s*(MP:\d+)/g){ + my $parent_id = $1; + $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; + push @{$pheno_id2subtypes{$parent_id}}, $id; + } + if(exists $problematic_terms{$id}){ + if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /$match/){ + $matched_pheno_ids{$id} .= "; $match"; + } + } + } + elsif(/\b($query)/o){ + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /$match/){ + $matched_pheno_ids{$id} .= "; $match"; + } + } +} +close(OBO); +#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; + +open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; + +# Implements term subsumption +my @matched_pheno_ids = keys %matched_pheno_ids; +for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ + my $pheno_id = $matched_pheno_ids[$i]; + next unless exists $pheno_id2subtypes{$pheno_id}; + for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ + if(not exists $matched_pheno_ids{$sub_type_id}){ + $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; + push @matched_pheno_ids, $sub_type_id; + } + } +} + +$/="\n"; # record separator +my %human2mouse; +# example line: +# WNT3A 89780 Wnt3a MGI:98956 MP:0003012 MP:0003631 ... MP:0010768 +open(HUMAN2MOUSE, $human_mouse_file) + or die "Cannot open $human_mouse_file for reading: $!\n"; +while(){ + my @F = split /\t/, $_; + $human2mouse{$F[0]} = $F[2]; +} +close(HUMAN2MOUSE); + +my %gene2pheno_ids; +# example line: +# Rbpj/Rbpj Rbpj involves: 129S2/SvPas * C57BL/6 MP:0001614 15466160 MGI:96522 +open(PHENO, $geno_pheno_file) + or die "Cannot open $geno_pheno_file for reading: $!\n"; +while(){ + chomp; + my @F = split /\t/, $_; + next unless $#F > 2; # does it have the phenotype id field? + my $knockout = $F[0]; + next if $knockout =~ /,/; # ignore double knockouts etc. + $knockout =~ s/^(\S+?)<.*/$1/; # keep only first gene name bit of knockout description + my $pheno_id = $F[3]; + $gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout}; + push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]]; +} + +# remove genes if they don't have a matching phenotype +for my $gene (keys %gene2pheno_ids){ + my $keep = 0; + for my $pheno_id (@{$gene2pheno_ids{$gene}}){ + if(exists $matched_pheno_ids{$pheno_id->[0]}){ + $keep = 1; + last; + } + } + delete $gene2pheno_ids{$gene} unless $keep; +} +#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet; + +$/ = "\n"; # one line at, a time from the HGVS file please! +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +my $header = ; +chomp $header; +my @header_columns = split /\t/, $header; +my $gene_name_column; +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "Gene Name"){ + $gene_name_column = $i; + } +} +if(not defined $gene_name_column){ + die "Could not find 'Gene Name' column in the input header, aborting\n"; +} +print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n"; + +# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list +while(){ + chomp; + my @F = split /\t/, $_, -1; + my (%target_phenos, %other_phenos); + for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ + next unless exists $human2mouse{$gene_name}; + next unless exists $gene2pheno_ids{$human2mouse{$gene_name}}; + for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){ + my ($id, $pmid) = @$pheno_id; + if(exists $matched_pheno_ids{$id}){ + $target_phenos{$pmid} = [] unless exists $target_phenos{$pmid}; + push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; + } + else{ + $other_phenos{$pmid} = [] unless exists $other_phenos{$pmid}; + push @{$other_phenos{$pmid}}, $pheno_id2name{$id}; + } + } + } + if(%target_phenos){ + print OUT join("\t", @F); + print OUT "\t"; + my $count = 0; + for my $pmid (keys %target_phenos){ + print OUT " // " if $count++; + print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}}); + } + print OUT "\t"; + $count = 0; + for my $pmid (keys %other_phenos){ + print OUT " // " if $count++; + print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}}); + } + print OUT "\n"; + } + else{ + print OUT join("\t", @F, "", ""), "\n"; + } +} +close(HGVS); +close(OUT); diff -r 000000000000 -r 6411ca16916e filter_by_susceptibility_loci_pipe --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_susceptibility_loci_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,174 @@ +#!/usr/bin/env perl + +# Reports SNPs associated with susceptibility loci determined by GWAS +use strict; +use warnings; + +@ARGV == 4 or die "Usage: $0 \n"; +my $gwas_file = shift @ARGV; +my $hgvs_file = shift @ARGV; +my $outfile = shift @ARGV; +my $pheno_query = shift @ARGV; + +$pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace +my $s; +$pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg; +my @or_terms = split /\s+or\s+/i, $pheno_query; +for my $term (@or_terms){ + $term = and_query($term); # recursively convert Boolean AND to equivalent regex +} + +# Date Added to Catalog PUBMEDID First Author Date Journal Link Study Disease/Trait Initial Sample Size Replication Sample Size Region Chr_id Chr_pos Reported Gene(s) Mapped_gene Upstream_gene_id Downstream_gene_id Snp_gene_ids Upstream_gene_distance Downstream_gene_distance Strongest SNP-Risk Allele SNPs Merged Snp_id_current Context Intergenic Risk Allele Frequency p-Value Pvalue_mlog p-Value (text) OR or beta 95% CI (text) Platform [SNPs passing QC] CNV +open(GWAS, $gwas_file) + or die "Cannot open GWAS file for reading: $!\n"; +my $header = ; +chomp $header; +my @columns = split /\t/, $header; +my $trait_column_index = -1; +my $study_column_index = -1; +my $chr_column_index = -1; +my $pos_column_index = -1; +my $allele_column_index = -1; +my $context_column_index = -1; +my $odds_column_index = -1; +my $pubmed_column_index = -1; +for(my $i = 0; $i < $#columns; $i++){ + if($columns[$i] eq "Disease/Trait"){ + $trait_column_index = $i; + } + elsif($columns[$i] eq "Study"){ + $study_column_index = $i; + } + elsif($columns[$i] eq "Chr_id"){ + $chr_column_index = $i; + } + elsif($columns[$i] eq "Chr_pos"){ + $pos_column_index = $i; + } + elsif($columns[$i] eq "Strongest SNP-Risk Allele"){ + $allele_column_index = $i; + } + elsif($columns[$i] eq "p-Value (text)"){ + $context_column_index = $i; + } + elsif($columns[$i] eq "OR or beta"){ + $odds_column_index = $i; + } + elsif($columns[$i] eq "PUBMEDID"){ + $pubmed_column_index = $i; + } +} +if($trait_column_index == -1){ + die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($study_column_index == -1){ + die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($chr_column_index == -1){ + die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($pos_column_index == -1){ + die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($allele_column_index == -1){ + die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($context_column_index == -1){ + die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($odds_column_index == -1){ + die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($pubmed_column_index == -1){ + die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; +} + +my %snp2desc; +while(){ + chomp; + next unless /\S/; + my @F = split /\t/, $_; + my $chr = $F[$chr_column_index]; + $chr =~ s/^chr//; # drop prefix if present + my $strongest_risk_allele = $F[$allele_column_index]; + $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter + $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]]; +} +close(GWAS); + +open(MATCHOUT, ">$outfile") + or die "Cannot open $outfile for writing: $!\n"; + +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +$header = ; +chomp $header; +my @header_columns = split /\t/, $header; +my ($chr_column, $obs_column, $pos_column); +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($header_columns[$i] eq "Obs base"){ + $obs_column = $i; + } + elsif($header_columns[$i] eq "Chr"){ + $chr_column = $i; + } +} +if(not defined $chr_column){ + die "Could not find 'Chr' column in the input header, aborting\n"; +} +if(not defined $pos_column){ + die "Could not find 'DNA From' column in the input header, aborting\n"; +} +if(not defined $obs_column){ + die "Could not find 'Obs base' column in the input header, aborting\n"; +} + +print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n"; +while(){ + chomp; + my @F = split /\t/, $_, -1; + my $allele = $F[$obs_column]; + $allele =~ s(/.*$)(); # ignore ref allele in het calls + my $chr = $F[$chr_column]; + $chr =~ s/^chr//; # remove chr name prefix if present + my $key = $chr.":".$F[$pos_column].":".$allele; + #print STDERR "$key\n"; + if(not exists $snp2desc{$key}){ + print MATCHOUT join("\t", @F, "", "", ""), "\n"; + next; + } + my @matches; + for my $term (@or_terms){ + next if grep {$_ eq $term} @matches; + #print STDERR "Checking match for term $term...\n"; + if($snp2desc{$key}->[0] =~ /\b$term/i or # trait + $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context + $snp2desc{$key}->[2] =~ /\b$term/i){ # study name + push @matches, $term; + last; + } + } + chomp $F[$#F]; + # print all GWAS, whether they match query or not + print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n"; +} +close(HGVS); +close(MATCHOUT); + +sub and_query{ + my ($query) = @_; + if($query =~ /^(.+)\s+and\s+(.+)$/){ + my $t1 = $1; + my $t2 = $2; + my $term1 = and_query($t1); + my $term2 = and_query($t2); + return "($term1.*$term2|$term2.*$term1)"; + } + else{ + return $query; # base case: single term + } + return +} diff -r 000000000000 -r 6411ca16916e tool-data/associate_phenotypes.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/associate_phenotypes.loc Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,1 @@ +dbs_dir /export/achri_galaxy/dbs/