Mercurial > repos > yusuf > associate_phenotypes
view filter_by_human_phenotype_ontology_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; my $quiet = 0; if(@ARGV and $ARGV[0] =~ /^-q/){ $quiet = 1; shift @ARGV; } @ARGV == 4 or die "Usage: $0 [-q(uiet)] <human_phenotype_ontology_dir> <hgvs_annotated.txt> <output.txt> <query>\n", "Where query has the format \"this or that\", \"this and that\", etc.\n", "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n"; my $hpo_dir = shift @ARGV; my $obo_file = "$hpo_dir/human-phenotype-ontology.obo"; my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt"; my $hgvs_file = shift @ARGV; my $out_file = shift @ARGV; my $query = shift @ARGV; # Below is a list of ontology terms with definitions including hyponyms and meronyms, so only match text on title so as not to get too many false positives my %problematic = qw(HP:0003271 Visceromegaly HP:0000246 Sinusitis); # Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO my %self_referencing = ( "HP:0000118" => "Phenotypic abnormality", "HP:0000455" => "Broad nasal tip", "HP:0000968" => "Ectodermal dysplasia", "HP:0002652" => "Skeletal dysplasia", "HP:0003812" => "Phenotypic variability", "HP:0003828" => "Variable expressivity", "HP:0003829" => "Incomplete penetrance", "HP:0004472" => "Mandibular hyperostosis", "HP:0004495" => "Thin anteverted nares", "HP:0005286" => "Hypoplastic, notched nares", "HP:0005321" => "Mandibulofacial dysostosis", "HP:0005871" => "Metaphyseal chondrodysplasia", "HP:0007589" => "Aplasia cutis congenita on trunk or limbs", "HP:0007819" => "Presenile cataracts"); # Ignore metadata field words, so we can properly match gene names like HPO :-) my %stop_words = ("name:" => 1, "HPO:" => 1, "alt_id:" => 1, "def:" => 1, "synonym:" => 1, "EXACT" => 1, "xref:" => 1, "UMLS:" => 1, "is_a:" => 1, "created_by:" => 1, "comment:" => 1, "creation_date:" => 1); # convert the query to a regex my $orig_query = $query; my $and_query = 0; $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ $and_query = 1; } $query =~ s/\s+or\s+/|/gi; $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name #print STDERR "Query regex is $query\n" unless $quiet; open(OBO, $obo_file) or die "Cannot open $obo_file for reading: $!\n"; my %matched_pheno_ids; my %pheno_id2subtypes; my %pheno_id2name; my $record_count; $/ = "\n[Term]\n"; <OBO>; # chuck header while(<OBO>){ next unless /^id:\s*(HP:\d+)/s; my $id = $1; next unless /\nname:\s*(.+?)\s*\n/s; my $name = $1; $pheno_id2name{$id} = $name; $record_count++; while(/\nis_a:\s*(HP:\d+)/g){ my $parent_id = $1; $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; push @{$pheno_id2subtypes{$parent_id}}, $id; } s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg; if(exists $problematic{$id}){ # for overmatching terms due to their descriptions (hyponyms and meronyms included in # parent entry), only match the title to not generate too many false poositives while($name =~ /\b($query)(\S*?:?)/go){ next if exists $stop_words{$1.$2}; my $match = $1; $match =~ tr/\t\n/ /; $match =~ s/\s{2,}/ /g; if(not exists $matched_pheno_ids{$id}){ $matched_pheno_ids{$id} = $match; } elsif($matched_pheno_ids{$id} !~ /$match/){ $matched_pheno_ids{$id} .= "; $match"; } } } else{ # normally, match anywhere in the entry while(/\b($query)(\S*?:?)/go){ next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id}; my $match = $1; $match =~ tr/\t\n/ /; $match =~ s/\s{2,}/ /g; if(not exists $matched_pheno_ids{$id}){ $matched_pheno_ids{$id} = $match; } elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){ $matched_pheno_ids{$id} .= "; $match"; } #print STDERR "Match $match for $_\n"; } } } close(OBO); #print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; # Implements term subsumption my @matched_pheno_ids = keys %matched_pheno_ids; for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ my $pheno_id = $matched_pheno_ids[$i]; next unless exists $pheno_id2subtypes{$pheno_id}; for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ if(not exists $matched_pheno_ids{$sub_type_id}){ $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; push @matched_pheno_ids, $sub_type_id; } } } $/="\n"; # record separator my %gene2pheno_ids; # Format: entrez-gene-id<tab>entrez-gene-symbol<tab>HPO-Term-Name<tab>HPO-Term-ID open(PHENO, $gene_pheno_file) or die "Cannot open $gene_pheno_file for reading: $!\n"; while(<PHENO>){ chomp; my @F = split /\t/, $_; next unless $#F > 2; # does it have the phenotype id field? my $gene = $F[1]; my $pheno_id = $F[3]; $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene}; push @{$gene2pheno_ids{$gene}}, $pheno_id; } # remove genes if they don't have a matching phenotype for my $gene (keys %gene2pheno_ids){ my $keep = 0; for my $pheno_id (@{$gene2pheno_ids{$gene}}){ if(exists $matched_pheno_ids{$pheno_id}){ $keep = 1; last; } } delete $gene2pheno_ids{$gene} unless $keep; } #print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet; $/ = "\n"; # one line at, a time from the HGVS file please! open(HGVS, $hgvs_file) or die "Cannot open $hgvs_file for reading: $!\n"; my $header = <HGVS>; chomp $header; my @header_columns = split /\t/, $header; my $gene_name_column; for(my $i = 0; $i <= $#header_columns; $i++){ if($header_columns[$i] eq "Gene Name"){ $gene_name_column = $i; } } if(not defined $gene_name_column){ die "Could not find 'Gene Name' column in the input header, aborting\n"; } open(OUT, ">$out_file") or die "Cannot open $out_file for writing: $!\n"; print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n"; # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list while(<HGVS>){ chomp; my @F = split /\t/, $_, -1; my (@target_phenos, @other_phenos); for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ next unless exists $gene2pheno_ids{$gene_name}; for my $id (@{$gene2pheno_ids{$gene_name}}){ next unless exists $pheno_id2name{$id}; if(exists $matched_pheno_ids{$id}){ push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; } else{ push @other_phenos, $pheno_id2name{$id}; } } } if(@target_phenos){ print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n"; } else{ print OUT join("\t", @F, "", ""), "\n"; } } close(OUT); close(HGVS);