0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 my $quiet = 0;
|
|
7 if(@ARGV and $ARGV[0] =~ /^-q/){
|
|
8 $quiet = 1;
|
|
9 shift @ARGV;
|
|
10 }
|
|
11
|
|
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <MGI knockout pheno data dir> <hgvs_annotated.txt> <output.txt> <query>\n",
|
|
13 "Where query has the format \"this or that\", \"this and that\", etc.\n",
|
|
14 "Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n";
|
|
15
|
|
16 my $mgi_dir = shift @ARGV;
|
|
17 my $obo_file = "$mgi_dir/MPheno_OBO.ontology";
|
|
18 my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt";
|
|
19 my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt";
|
|
20 my $hgvs_file = shift @ARGV;
|
|
21 my $out_file = shift @ARGV;
|
|
22 my $query = shift @ARGV;
|
|
23
|
|
24 #$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals
|
|
25 my %problematic_terms = ();
|
|
26
|
|
27 # convert the query to a regex
|
|
28 my $orig_query = $query;
|
|
29 my $and_query = 0;
|
|
30 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
|
|
31 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
|
|
32 $and_query = 1;
|
|
33 }
|
|
34 $query =~ s/\s+or\s+/|/gi;
|
|
35 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
|
|
36 #print STDERR "Query regex is $query\n" unless $quiet;
|
|
37
|
|
38 open(OBO, $obo_file)
|
|
39 or die "Cannot open $obo_file for reading: $!\n";
|
|
40 my %matched_pheno_ids;
|
|
41 my %pheno_id2subtypes;
|
|
42 my %pheno_id2name;
|
|
43 my $record_count;
|
|
44 $/ = "\n[Term]\n";
|
|
45 <OBO>; # chuck header
|
|
46 while(<OBO>){
|
|
47 next unless /^id:\s*(MP:\d+)/s;
|
|
48 my $id = $1;
|
|
49 next unless /\nname:\s*(.+?)\s*\n/s;
|
|
50 my $name = $1;
|
|
51 $pheno_id2name{$id} = $name;
|
|
52 $record_count++;
|
|
53 while(/\nis_a:\s*(MP:\d+)/g){
|
|
54 my $parent_id = $1;
|
|
55 $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
|
|
56 push @{$pheno_id2subtypes{$parent_id}}, $id;
|
|
57 }
|
|
58 if(exists $problematic_terms{$id}){
|
|
59 if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text
|
|
60 my $match = $1;
|
|
61 $match =~ tr/\t\n/ /;
|
|
62 $match =~ s/ {2,}/ /g;
|
|
63 if(not exists $matched_pheno_ids{$id}){
|
|
64 $matched_pheno_ids{$id} = $match;
|
|
65 }
|
|
66 elsif($matched_pheno_ids{$id} !~ /$match/){
|
|
67 $matched_pheno_ids{$id} .= "; $match";
|
|
68 }
|
|
69 }
|
|
70 }
|
|
71 elsif(/\b($query)/o){
|
|
72 my $match = $1;
|
|
73 $match =~ tr/\t\n/ /;
|
|
74 $match =~ s/ {2,}/ /g;
|
|
75 if(not exists $matched_pheno_ids{$id}){
|
|
76 $matched_pheno_ids{$id} = $match;
|
|
77 }
|
|
78 elsif($matched_pheno_ids{$id} !~ /$match/){
|
|
79 $matched_pheno_ids{$id} .= "; $match";
|
|
80 }
|
|
81 }
|
|
82 }
|
|
83 close(OBO);
|
|
84 #print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";
|
|
85
|
|
86 open(OUT, ">$out_file")
|
|
87 or die "Cannot open $out_file for writing: $!\n";
|
|
88
|
|
89 # Implements term subsumption
|
|
90 my @matched_pheno_ids = keys %matched_pheno_ids;
|
|
91 for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
|
|
92 my $pheno_id = $matched_pheno_ids[$i];
|
|
93 next unless exists $pheno_id2subtypes{$pheno_id};
|
|
94 for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
|
|
95 if(not exists $matched_pheno_ids{$sub_type_id}){
|
|
96 $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
|
|
97 push @matched_pheno_ids, $sub_type_id;
|
|
98 }
|
|
99 }
|
|
100 }
|
|
101
|
|
102 $/="\n"; # record separator
|
|
103 my %human2mouse;
|
|
104 # example line:
|
|
105 # WNT3A 89780 Wnt3a MGI:98956 MP:0003012 MP:0003631 ... MP:0010768
|
|
106 open(HUMAN2MOUSE, $human_mouse_file)
|
|
107 or die "Cannot open $human_mouse_file for reading: $!\n";
|
|
108 while(<HUMAN2MOUSE>){
|
|
109 my @F = split /\t/, $_;
|
|
110 $human2mouse{$F[0]} = $F[2];
|
|
111 }
|
|
112 close(HUMAN2MOUSE);
|
|
113
|
|
114 my %gene2pheno_ids;
|
|
115 # example line:
|
|
116 # Rbpj<tm1Kyo>/Rbpj<tm1Kyo> Rbpj<tm1Kyo> involves: 129S2/SvPas * C57BL/6 MP:0001614 15466160 MGI:96522
|
|
117 open(PHENO, $geno_pheno_file)
|
|
118 or die "Cannot open $geno_pheno_file for reading: $!\n";
|
|
119 while(<PHENO>){
|
|
120 chomp;
|
|
121 my @F = split /\t/, $_;
|
|
122 next unless $#F > 2; # does it have the phenotype id field?
|
|
123 my $knockout = $F[0];
|
|
124 next if $knockout =~ /,/; # ignore double knockouts etc.
|
|
125 $knockout =~ s/^(\S+?)<.*/$1/; # keep only first gene name bit of knockout description
|
|
126 my $pheno_id = $F[3];
|
|
127 $gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout};
|
|
128 push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]];
|
|
129 }
|
|
130
|
|
131 # remove genes if they don't have a matching phenotype
|
|
132 for my $gene (keys %gene2pheno_ids){
|
|
133 my $keep = 0;
|
|
134 for my $pheno_id (@{$gene2pheno_ids{$gene}}){
|
|
135 if(exists $matched_pheno_ids{$pheno_id->[0]}){
|
|
136 $keep = 1;
|
|
137 last;
|
|
138 }
|
|
139 }
|
|
140 delete $gene2pheno_ids{$gene} unless $keep;
|
|
141 }
|
|
142 #print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet;
|
|
143
|
|
144 $/ = "\n"; # one line at, a time from the HGVS file please!
|
|
145 open(HGVS, $hgvs_file)
|
|
146 or die "Cannot open $hgvs_file for reading: $!\n";
|
|
147 my $header = <HGVS>;
|
|
148 chomp $header;
|
|
149 my @header_columns = split /\t/, $header;
|
|
150 my $gene_name_column;
|
|
151 for(my $i = 0; $i <= $#header_columns; $i++){
|
|
152 if($header_columns[$i] eq "Gene Name"){
|
|
153 $gene_name_column = $i;
|
|
154 }
|
|
155 }
|
|
156 if(not defined $gene_name_column){
|
|
157 die "Could not find 'Gene Name' column in the input header, aborting\n";
|
|
158 }
|
|
159 print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n";
|
|
160
|
|
161 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list
|
|
162 while(<HGVS>){
|
|
163 chomp;
|
|
164 my @F = split /\t/, $_, -1;
|
|
165 my (%target_phenos, %other_phenos);
|
|
166 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
|
|
167 next unless exists $human2mouse{$gene_name};
|
|
168 next unless exists $gene2pheno_ids{$human2mouse{$gene_name}};
|
|
169 for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){
|
|
170 my ($id, $pmid) = @$pheno_id;
|
|
171 if(exists $matched_pheno_ids{$id}){
|
|
172 $target_phenos{$pmid} = [] unless exists $target_phenos{$pmid};
|
|
173 push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
|
|
174 }
|
|
175 else{
|
|
176 $other_phenos{$pmid} = [] unless exists $other_phenos{$pmid};
|
|
177 push @{$other_phenos{$pmid}}, $pheno_id2name{$id};
|
|
178 }
|
|
179 }
|
|
180 }
|
|
181 if(%target_phenos){
|
|
182 print OUT join("\t", @F);
|
|
183 print OUT "\t";
|
|
184 my $count = 0;
|
|
185 for my $pmid (keys %target_phenos){
|
|
186 print OUT " // " if $count++;
|
|
187 print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}});
|
|
188 }
|
|
189 print OUT "\t";
|
|
190 $count = 0;
|
|
191 for my $pmid (keys %other_phenos){
|
|
192 print OUT " // " if $count++;
|
|
193 print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}});
|
|
194 }
|
|
195 print OUT "\n";
|
|
196 }
|
|
197 else{
|
|
198 print OUT join("\t", @F, "", ""), "\n";
|
|
199 }
|
|
200 }
|
|
201 close(HGVS);
|
|
202 close(OUT);
|