Mercurial > repos > yusuf > associate_phenotypes
diff filter_by_mouse_knockout_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_mouse_knockout_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,202 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 4 or die "Usage: $0 [-q(uiet)] <MGI knockout pheno data dir> <hgvs_annotated.txt> <output.txt> <query>\n", + "Where query has the format \"this or that\", \"this and that\", etc.\n", + "Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n"; + +my $mgi_dir = shift @ARGV; +my $obo_file = "$mgi_dir/MPheno_OBO.ontology"; +my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt"; +my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt"; +my $hgvs_file = shift @ARGV; +my $out_file = shift @ARGV; +my $query = shift @ARGV; + +#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals +my %problematic_terms = (); + +# convert the query to a regex +my $orig_query = $query; +my $and_query = 0; +$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; +if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ + $and_query = 1; +} +$query =~ s/\s+or\s+/|/gi; +$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name +#print STDERR "Query regex is $query\n" unless $quiet; + +open(OBO, $obo_file) + or die "Cannot open $obo_file for reading: $!\n"; +my %matched_pheno_ids; +my %pheno_id2subtypes; +my %pheno_id2name; +my $record_count; +$/ = "\n[Term]\n"; +<OBO>; # chuck header +while(<OBO>){ + next unless /^id:\s*(MP:\d+)/s; + my $id = $1; + next unless /\nname:\s*(.+?)\s*\n/s; + my $name = $1; + $pheno_id2name{$id} = $name; + $record_count++; + while(/\nis_a:\s*(MP:\d+)/g){ + my $parent_id = $1; + $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; + push @{$pheno_id2subtypes{$parent_id}}, $id; + } + if(exists $problematic_terms{$id}){ + if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /$match/){ + $matched_pheno_ids{$id} .= "; $match"; + } + } + } + elsif(/\b($query)/o){ + my $match = $1; + $match =~ tr/\t\n/ /; + $match =~ s/ {2,}/ /g; + if(not exists $matched_pheno_ids{$id}){ + $matched_pheno_ids{$id} = $match; + } + elsif($matched_pheno_ids{$id} !~ /$match/){ + $matched_pheno_ids{$id} .= "; $match"; + } + } +} +close(OBO); +#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; + +open(OUT, ">$out_file") + or die "Cannot open $out_file for writing: $!\n"; + +# Implements term subsumption +my @matched_pheno_ids = keys %matched_pheno_ids; +for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ + my $pheno_id = $matched_pheno_ids[$i]; + next unless exists $pheno_id2subtypes{$pheno_id}; + for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ + if(not exists $matched_pheno_ids{$sub_type_id}){ + $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; + push @matched_pheno_ids, $sub_type_id; + } + } +} + +$/="\n"; # record separator +my %human2mouse; +# example line: +# WNT3A 89780 Wnt3a MGI:98956 MP:0003012 MP:0003631 ... MP:0010768 +open(HUMAN2MOUSE, $human_mouse_file) + or die "Cannot open $human_mouse_file for reading: $!\n"; +while(<HUMAN2MOUSE>){ + my @F = split /\t/, $_; + $human2mouse{$F[0]} = $F[2]; +} +close(HUMAN2MOUSE); + +my %gene2pheno_ids; +# example line: +# Rbpj<tm1Kyo>/Rbpj<tm1Kyo> Rbpj<tm1Kyo> involves: 129S2/SvPas * C57BL/6 MP:0001614 15466160 MGI:96522 +open(PHENO, $geno_pheno_file) + or die "Cannot open $geno_pheno_file for reading: $!\n"; +while(<PHENO>){ + chomp; + my @F = split /\t/, $_; + next unless $#F > 2; # does it have the phenotype id field? + my $knockout = $F[0]; + next if $knockout =~ /,/; # ignore double knockouts etc. + $knockout =~ s/^(\S+?)<.*/$1/; # keep only first gene name bit of knockout description + my $pheno_id = $F[3]; + $gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout}; + push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]]; +} + +# remove genes if they don't have a matching phenotype +for my $gene (keys %gene2pheno_ids){ + my $keep = 0; + for my $pheno_id (@{$gene2pheno_ids{$gene}}){ + if(exists $matched_pheno_ids{$pheno_id->[0]}){ + $keep = 1; + last; + } + } + delete $gene2pheno_ids{$gene} unless $keep; +} +#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet; + +$/ = "\n"; # one line at, a time from the HGVS file please! +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +my $header = <HGVS>; +chomp $header; +my @header_columns = split /\t/, $header; +my $gene_name_column; +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "Gene Name"){ + $gene_name_column = $i; + } +} +if(not defined $gene_name_column){ + die "Could not find 'Gene Name' column in the input header, aborting\n"; +} +print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n"; + +# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list +while(<HGVS>){ + chomp; + my @F = split /\t/, $_, -1; + my (%target_phenos, %other_phenos); + for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ + next unless exists $human2mouse{$gene_name}; + next unless exists $gene2pheno_ids{$human2mouse{$gene_name}}; + for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){ + my ($id, $pmid) = @$pheno_id; + if(exists $matched_pheno_ids{$id}){ + $target_phenos{$pmid} = [] unless exists $target_phenos{$pmid}; + push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; + } + else{ + $other_phenos{$pmid} = [] unless exists $other_phenos{$pmid}; + push @{$other_phenos{$pmid}}, $pheno_id2name{$id}; + } + } + } + if(%target_phenos){ + print OUT join("\t", @F); + print OUT "\t"; + my $count = 0; + for my $pmid (keys %target_phenos){ + print OUT " // " if $count++; + print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}}); + } + print OUT "\t"; + $count = 0; + for my $pmid (keys %other_phenos){ + print OUT " // " if $count++; + print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}}); + } + print OUT "\n"; + } + else{ + print OUT join("\t", @F, "", ""), "\n"; + } +} +close(HGVS); +close(OUT);