view filter_by_human_phenotype_ontology_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;

my $quiet = 0;
if(@ARGV and $ARGV[0] =~ /^-q/){
  $quiet = 1;
  shift @ARGV;
}

@ARGV == 4 or die "Usage: $0 [-q(uiet)] <human_phenotype_ontology_dir> <hgvs_annotated.txt> <output.txt> <query>\n",
                  "Where query has the format \"this or that\", \"this and that\", etc.\n",
                  "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n";

my $hpo_dir = shift @ARGV;
my $obo_file = "$hpo_dir/human-phenotype-ontology.obo";
my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt";
my $hgvs_file = shift @ARGV;
my $out_file = shift @ARGV;
my $query = shift @ARGV;

# Below is a list of ontology terms with definitions including hyponyms and meronyms, so only match text on title so as not to get too many false positives 
my %problematic = qw(HP:0003271 Visceromegaly
                     HP:0000246 Sinusitis);
# Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO
my %self_referencing = (
       "HP:0000118" => "Phenotypic abnormality",
       "HP:0000455" => "Broad nasal tip",
       "HP:0000968" => "Ectodermal dysplasia",
       "HP:0002652" => "Skeletal dysplasia",
       "HP:0003812" => "Phenotypic variability",
       "HP:0003828" => "Variable expressivity",
       "HP:0003829" => "Incomplete penetrance",
       "HP:0004472" => "Mandibular hyperostosis",
       "HP:0004495" => "Thin anteverted nares",
       "HP:0005286" => "Hypoplastic, notched nares",
       "HP:0005321" => "Mandibulofacial dysostosis",
       "HP:0005871" => "Metaphyseal chondrodysplasia",
       "HP:0007589" => "Aplasia cutis congenita on trunk or limbs",
       "HP:0007819" => "Presenile cataracts");

# Ignore metadata field words, so we can properly match gene names like HPO :-)
my %stop_words = ("name:" => 1,
                  "HPO:" => 1,
                  "alt_id:" => 1,
                  "def:" => 1,
                  "synonym:" => 1,
                  "EXACT" => 1,
                  "xref:" => 1,
                  "UMLS:" => 1,
                  "is_a:" => 1,
                  "created_by:" => 1,
                  "comment:" => 1,
                  "creation_date:" => 1);

# convert the query to a regex
my $orig_query = $query;
my $and_query = 0;
$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
  $and_query = 1;
}
$query =~ s/\s+or\s+/|/gi;
$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
#print STDERR "Query regex is $query\n" unless $quiet;


open(OBO, $obo_file)
  or die "Cannot open $obo_file for reading: $!\n";
my %matched_pheno_ids;
my %pheno_id2subtypes;
my %pheno_id2name;
my $record_count;
$/ = "\n[Term]\n";
<OBO>; # chuck header
while(<OBO>){
  next unless /^id:\s*(HP:\d+)/s;
  my $id = $1;
  next unless /\nname:\s*(.+?)\s*\n/s;
  my $name = $1;
  $pheno_id2name{$id} = $name;
  $record_count++;
  while(/\nis_a:\s*(HP:\d+)/g){
    my $parent_id = $1;
    $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
    push @{$pheno_id2subtypes{$parent_id}}, $id;
  }
  s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg;
  if(exists $problematic{$id}){ # for overmatching terms due to their descriptions (hyponyms and meronyms included in 
                                # parent entry), only match the title to not generate too many false poositives
    while($name =~ /\b($query)(\S*?:?)/go){
      next if exists $stop_words{$1.$2};
      my $match = $1;
      $match =~ tr/\t\n/  /;
      $match =~ s/\s{2,}/ /g;
      if(not exists $matched_pheno_ids{$id}){
        $matched_pheno_ids{$id} = $match;
      } 
      elsif($matched_pheno_ids{$id} !~ /$match/){
        $matched_pheno_ids{$id} .= "; $match";
      }
    }
  }
  else{ # normally, match anywhere in the entry
    while(/\b($query)(\S*?:?)/go){
      next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id};
      my $match = $1;
      $match =~ tr/\t\n/  /;
      $match =~ s/\s{2,}/ /g;
      if(not exists $matched_pheno_ids{$id}){
        $matched_pheno_ids{$id} = $match;
      } 
      elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){
        $matched_pheno_ids{$id} .= "; $match";
      }
      #print STDERR "Match $match for $_\n";
    }
  }
}
close(OBO);
#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";


# Implements term subsumption
my @matched_pheno_ids = keys %matched_pheno_ids;
for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
  my $pheno_id = $matched_pheno_ids[$i];
  next unless exists $pheno_id2subtypes{$pheno_id};
  for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
    if(not exists $matched_pheno_ids{$sub_type_id}){
      $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
      push @matched_pheno_ids, $sub_type_id;
    }
  }
}

$/="\n"; # record separator
my %gene2pheno_ids;
# Format: entrez-gene-id<tab>entrez-gene-symbol<tab>HPO-Term-Name<tab>HPO-Term-ID
open(PHENO, $gene_pheno_file)
  or die "Cannot open $gene_pheno_file for reading: $!\n";
while(<PHENO>){
  chomp;
  my @F = split /\t/, $_;
  next unless $#F > 2; # does it have the phenotype id field?
  my $gene = $F[1];
  my $pheno_id = $F[3];
  $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene};
  push @{$gene2pheno_ids{$gene}}, $pheno_id;
}

# remove genes if they don't have a matching phenotype
for my $gene (keys %gene2pheno_ids){
  my $keep = 0;
  for my $pheno_id (@{$gene2pheno_ids{$gene}}){
    if(exists $matched_pheno_ids{$pheno_id}){
      $keep = 1;
      last;
    }
  }
  delete $gene2pheno_ids{$gene} unless $keep;
}
#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet;

$/ = "\n"; # one line at, a time from the HGVS file please!
open(HGVS, $hgvs_file)
  or die "Cannot open $hgvs_file for reading: $!\n";
my $header = <HGVS>;
chomp $header;
my @header_columns = split /\t/, $header;
my $gene_name_column;
for(my $i = 0; $i <= $#header_columns; $i++){
  if($header_columns[$i] eq "Gene Name"){
    $gene_name_column = $i;
  }
}
if(not defined $gene_name_column){
  die "Could not find 'Gene Name' column in the input header, aborting\n";
}
open(OUT, ">$out_file")
  or die "Cannot open $out_file for writing: $!\n";
print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n";

# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list
while(<HGVS>){
  chomp;
  my @F = split /\t/, $_, -1;
  my (@target_phenos, @other_phenos);
  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
    next unless exists $gene2pheno_ids{$gene_name};
    for my $id (@{$gene2pheno_ids{$gene_name}}){
      next unless exists $pheno_id2name{$id};
      if(exists $matched_pheno_ids{$id}){
        push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
      }
      else{
        push @other_phenos, $pheno_id2name{$id};
      }
    }
  }
  if(@target_phenos){
    print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n";
  }
  else{
    print OUT join("\t", @F, "", ""), "\n";
  }
}
close(OUT);
close(HGVS);