associate_phenotypes: filter_by_mouse_knockout

comparison filter_by_mouse_knockout_pipe @ 0:6411ca16916e default tip

initial commit

author	Yusuf Ali <ali@yusuf.email>
date	Wed, 25 Mar 2015 13:23:29 -0600
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:6411ca16916e
+#!/usr/bin/env perl
+use strict;
+use warnings;
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+$quiet = 1;
+shift @ARGV;
+}
+@ARGV == 4 or die "Usage: $0 [-q(uiet)] <MGI knockout pheno data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
+"Where query has the format \"this or that\", \"this and that\", etc.\n",
+"Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n";
+my $mgi_dir = shift @ARGV;
+my $obo_file = "$mgi_dir/MPheno_OBO.ontology";
+my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt";
+my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt";
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $query = shift @ARGV;
+#$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals
+my %problematic_terms = ();
+# convert the query to a regex
+my $orig_query = $query;
+my $and_query = 0;
+$query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
+if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
+$and_query = 1;
+}
+$query =~ s/\s+or\s+/|/gi;
+$query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
+#print STDERR "Query regex is $query\n" unless $quiet;
+open(OBO, $obo_file)
+or die "Cannot open $obo_file for reading: $!\n";
+my %matched_pheno_ids;
+my %pheno_id2subtypes;
+my %pheno_id2name;
+my $record_count;
+$/ = "\n[Term]\n";
+<OBO>; # chuck header
+while(<OBO>){
+next unless /^id:\s*(MP:\d+)/s;
+my $id = $1;
+next unless /\nname:\s*(.+?)\s*\n/s;
+my $name = $1;
+$pheno_id2name{$id} = $name;
+$record_count++;
+while(/\nis_a:\s*(MP:\d+)/g){
+my $parent_id = $1;
+$pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
+push @{$pheno_id2subtypes{$parent_id}}, $id;
+}
+if(exists $problematic_terms{$id}){
+if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text
+my $match = $1;
+$match =~ tr/\t\n/  /;
+$match =~ s/ {2,}/ /g;
+if(not exists $matched_pheno_ids{$id}){
+$matched_pheno_ids{$id} = $match;
+}
+elsif($matched_pheno_ids{$id} !~ /$match/){
+$matched_pheno_ids{$id} .= "; $match";
+}
+}
+}
+elsif(/\b($query)/o){
+my $match = $1;
+$match =~ tr/\t\n/  /;
+$match =~ s/ {2,}/ /g;
+if(not exists $matched_pheno_ids{$id}){
+$matched_pheno_ids{$id} = $match;
+}
+elsif($matched_pheno_ids{$id} !~ /$match/){
+$matched_pheno_ids{$id} .= "; $match";
+}
+}
+}
+close(OBO);
+#print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";
+open(OUT, ">$out_file")
+or die "Cannot open $out_file for writing: $!\n";
+# Implements term subsumption
+my @matched_pheno_ids = keys %matched_pheno_ids;
+for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
+my $pheno_id = $matched_pheno_ids[$i];
+next unless exists $pheno_id2subtypes{$pheno_id};
+for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
+if(not exists $matched_pheno_ids{$sub_type_id}){
+$matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
+push @matched_pheno_ids, $sub_type_id;
+}
+}
+}
+$/="\n"; # record separator
+my %human2mouse;
+# example line:
+# WNT3A	  89780	Wnt3a	  MGI:98956	  MP:0003012 MP:0003631 ... MP:0010768
+open(HUMAN2MOUSE, $human_mouse_file)
+or die "Cannot open $human_mouse_file for reading: $!\n";
+while(<HUMAN2MOUSE>){
+my @F = split /\t/, $_;
+$human2mouse{$F[0]} = $F[2];
+}
+close(HUMAN2MOUSE);
+my %gene2pheno_ids;
+# example line:
+# Rbpj<tm1Kyo>/Rbpj<tm1Kyo>	Rbpj<tm1Kyo>	involves: 129S2/SvPas * C57BL/6	MP:0001614	15466160	MGI:96522
+open(PHENO, $geno_pheno_file)
+or die "Cannot open $geno_pheno_file for reading: $!\n";
+while(<PHENO>){
+chomp;
+my @F = split /\t/, $_;
+next unless $#F > 2; # does it have the phenotype id field?
+my $knockout = $F[0];
+next if $knockout =~ /,/; # ignore double knockouts etc.
+$knockout =~ s/^(\S+?)<.*/$1/; # keep only first gene name bit of knockout description
+my $pheno_id = $F[3];
+$gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout};
+push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]];
+}
+# remove genes if they don't have a matching phenotype
+for my $gene (keys %gene2pheno_ids){
+my $keep = 0;
+for my $pheno_id (@{$gene2pheno_ids{$gene}}){
+if(exists $matched_pheno_ids{$pheno_id->[0]}){
+$keep = 1;
+last;
+}
+}
+delete $gene2pheno_ids{$gene} unless $keep;
+}
+#print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet;
+$/ = "\n"; # one line at, a time from the HGVS file please!
+open(HGVS, $hgvs_file)
+or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my $gene_name_column;
+for(my $i = 0; $i <= $#header_columns; $i++){
+if($header_columns[$i] eq "Gene Name"){
+$gene_name_column = $i;
+}
+}
+if(not defined $gene_name_column){
+die "Could not find 'Gene Name' column in the input header, aborting\n";
+}
+print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n";
+# Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list
+while(<HGVS>){
+chomp;
+my @F = split /\t/, $_, -1;
+my (%target_phenos, %other_phenos);
+for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+next unless exists $human2mouse{$gene_name};
+next unless exists $gene2pheno_ids{$human2mouse{$gene_name}};
+for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){
+my ($id, $pmid) = @$pheno_id;
+if(exists $matched_pheno_ids{$id}){
+$target_phenos{$pmid} = [] unless exists $target_phenos{$pmid};
+push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
+}
+else{
+$other_phenos{$pmid} = [] unless exists $other_phenos{$pmid};
+push @{$other_phenos{$pmid}}, $pheno_id2name{$id};
+}
+}
+}
+if(%target_phenos){
+print OUT join("\t", @F);
+print OUT "\t";
+my $count = 0;
+for my $pmid (keys %target_phenos){
+print OUT " // " if $count++;
+print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}});
+}
+print OUT "\t";
+$count = 0;
+for my $pmid (keys %other_phenos){
+print OUT " // " if $count++;
+print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}});
+}
+print OUT "\n";
+}
+else{
+print OUT join("\t", @F, "", ""), "\n";
+}
+}
+close(HGVS);
+close(OUT);

Mercurial > repos > yusuf > associate_phenotypes

comparison filter_by_mouse_knockout_pipe @ 0:6411ca16916e default tip