view filter_by_gene_ontology_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;

my $quiet = 0;
if(@ARGV and $ARGV[0] =~ /^-q/){
  $quiet = 1;
  shift @ARGV;
}

@ARGV == 4 or die "Usage: $0 [-q(uiet)] <GO data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
                  "Where query has the format \"this or that\", \"this and that\", etc.\n",
                  "GOA and OBO files must be found in the GO data dir\n";

my $go_dir = shift @ARGV;
my $obo_file = "$go_dir/gene_ontology.1_2.obo";
my $hgvs_file = shift @ARGV;
my $out_file = shift @ARGV;
my $query = shift @ARGV;

# Terms with meronyms and hyponyms in their free text descriptions, causing overgeneralization problems
my %problematic_terms = ("GO:0033647" => "host intracellular organelle",
                         "GO:0006996" => "organelle organization",
                         "GO:0043226" => "organelle",
                         "GO:0043227" => "membrane-bounded organelle",
                         "GO:0043229" => "intracellular organelle",
                         "GO:0043231" => "intracellular membrane-bounded organelle",
                         "GO:0044384" => "host cell outer membrane",
                         "GO:0044422" => "organelle part",
                         "GO:0044446" => "intracellular organelle part",
                         "GO:0045202" => "synapse",
                         "GO:0033648" => "host intracellular membrane-bounded organelle");

#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals

# convert the query to a regex
my $orig_query = $query;
my $and_query = 0;
$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
  $and_query = 1;
}
$query =~ s/\s+or\s+/|/gi;
$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
#print STDERR "Query regex for GO is $query\n" unless $quiet;

open(OBO, $obo_file)
  or die "Cannot open $obo_file for reading: $!\n";
my %matched_go_ids;
my %go_id2subtypes;
my %go_id2name;
my $record_count;
$/ = "\n[Term]\n";
<OBO>; # chuck header
while(<OBO>){
  next unless /^id:\s*(GO:\d+)/s;
  my $id = $1;
  next unless /\nname:\s*(.+?)\s*\n/s;
  my $name = $1;
  $go_id2name{$id} = $name;
  $record_count++;
  while(/\nis_a:\s*(GO:\d+)/g){
    my $parent_id = $1;
    $go_id2subtypes{$parent_id} = [] unless exists $go_id2subtypes{$parent_id};
    push @{$go_id2subtypes{$parent_id}}, $id;
  }
  if(exists $problematic_terms{$id}){
    if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text
      my $match = $1;
      $match =~ tr/\t\n/  /;
      $match =~ s/ {2,}/ /g;
      if(not exists $matched_go_ids{$id}){
        $matched_go_ids{$id} = $match;
      }
      elsif($matched_go_ids{$id} !~ /$match/){
        $matched_go_ids{$id} .= "; $match";
      }
    }
  }
  elsif(/\b($query)/o){
    my $match = $1;
    $match =~ tr/\t\n/  /;
    $match =~ s/ {2,}/ /g;
    if(not exists $matched_go_ids{$id}){
      $matched_go_ids{$id} = $match;
    }
    elsif($matched_go_ids{$id} !~ /$match/){
      $matched_go_ids{$id} .= "; $match";
    }
    #print STDERR "Found match $match for $_\n";
  }
}
close(OBO);
#print STDERR "Found ", scalar(keys %matched_go_ids), "/$record_count go ontology terms matching the query\n";

open(OUT, ">$out_file")
  or die "Cannot open $out_file for writing: $!\n";

# Implements term subsumption
my @matched_go_ids = keys %matched_go_ids;
for(my $i = 0; $i <= $#matched_go_ids; $i++){
  my $go_id = $matched_go_ids[$i];
  next unless exists $go_id2subtypes{$go_id};
  for my $sub_type_id (@{$go_id2subtypes{$go_id}}){
    if(not exists $matched_go_ids{$sub_type_id}){
      $matched_go_ids{$sub_type_id} = $matched_go_ids{$go_id};
      push @matched_go_ids, $sub_type_id;
    }
  }
}

$/="\n"; # record separator
my %gene2go_ids;
opendir(GOADIR, $go_dir)
  or die "Cannot read directory $go_dir: $!\n";
while($_ = readdir(GOADIR)){
  next if /^\./; # hidden
  next unless /\.goa$/; # not a goa formatted file
  my $goa_file = $_;
  open(GOA, "$go_dir/$goa_file")
    or die "Cannot open $go_dir/$goa_file for reading: $!\n";
  #print STDERR "Processing file $goa_file\n";
  # example line:
  # UniProtKB	A0ASJ9			GO:0001664	GO_REF:0000033	ISS	PANTHER:PTN000026392	F	G protein alpha subunit AgGq6	A0ASJ9_ANOGA	protein	taxon:7165	20110125	RefGenome
  while(<GOA>){
    next if /^!/;  # comment
    chomp;
    my @F = split /\t/, $_;
    next unless $F[2] and $#F > 3; # does it have the gene name and go id fields?
    my $genename = uc($F[2]); # standardize gene names to upper case
    $genename =~ s/-//g;
    my $go_id = $F[4];
    $gene2go_ids{$genename} = [] unless exists $gene2go_ids{$genename};
    push @{$gene2go_ids{$genename}}, $go_id;
  }
  close(GOA);
}
close(GOADIR);
#print STDERR "Found ", scalar(keys %gene2go_ids), " total genes\n";

# remove genes if they don't have a matching go term
for my $genename (keys %gene2go_ids){
  my $keep = 0;
  for my $go_id (@{$gene2go_ids{$genename}}){
    if(exists $matched_go_ids{$go_id}){
      $keep = 1;
      last;
    }
  }
  delete $gene2go_ids{$genename} unless $keep;
}
#print STDERR "Found ", scalar(keys %gene2go_ids), " genes with gene ontology terms matching the query\n" unless $quiet;

$/ = "\n"; # one line at, a time from the HGVS file please!
open(HGVS, $hgvs_file)
  or die "Cannot open $hgvs_file for reading: $!\n";
my $header = <HGVS>;
chomp $header;
my @header_columns = split /\t/, $header;
my $gene_name_column;
for(my $i = 0; $i <= $#header_columns; $i++){
  if($header_columns[$i] eq "Gene Name"){
    $gene_name_column = $i;
  }
}
if(not defined $gene_name_column){
  die "Could not find 'Gene Name' column in the input header, aborting\n";
}

print OUT "$header\tQuickGO Gene Ontology Terms (matching $orig_query)\tQuickGO Gene Ontology Terms (other)\n";
# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target go term list
while(<HGVS>){
  chomp;
  my @F = split /\t/, $_, -1;
  my (@target_gos, @other_gos);
  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
    next unless exists $gene2go_ids{$gene_name};
    for my $id (@{$gene2go_ids{$gene_name}}){
      if(exists $matched_go_ids{$id}){
        push @target_gos, $go_id2name{$id}."($matched_go_ids{$id})";
      }
      else{
        push @other_gos, $go_id2name{$id};
      }
    }
  }
  if(@target_gos){
    my (%t,%o);
    # print unique terms
    print OUT join("\t", @F, join("; ", sort grep {not $t{$_}++} @target_gos), join("; ", sort grep {not $o{$_}++} @other_gos)), "\n";
  }
  else{
    print OUT join("\t", @F, "", ""), "\n";
  }
}