Mercurial > repos > yusuf > associate_phenotypes
comparison filter_by_mouse_knockout_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6411ca16916e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 my $quiet = 0; | |
7 if(@ARGV and $ARGV[0] =~ /^-q/){ | |
8 $quiet = 1; | |
9 shift @ARGV; | |
10 } | |
11 | |
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <MGI knockout pheno data dir> <hgvs_annotated.txt> <output.txt> <query>\n", | |
13 "Where query has the format \"this or that\", \"this and that\", etc.\n", | |
14 "Knockout files are available from ftp://ftp.informatics.jax.org/pub/reports/MPheno_OBO.ontology and ftp://ftp.informatics.jax.org/pub/reports/HMD_HumanPhenotype.rpt\n"; | |
15 | |
16 my $mgi_dir = shift @ARGV; | |
17 my $obo_file = "$mgi_dir/MPheno_OBO.ontology"; | |
18 my $human_mouse_file = "$mgi_dir/HMD_HumanPhenotype.rpt"; | |
19 my $geno_pheno_file = "$mgi_dir/MGI_PhenoGenoMP.rpt"; | |
20 my $hgvs_file = shift @ARGV; | |
21 my $out_file = shift @ARGV; | |
22 my $query = shift @ARGV; | |
23 | |
24 #$query = quotemeta($query); # in case there are meta characters in the query, treat them as literals | |
25 my %problematic_terms = (); | |
26 | |
27 # convert the query to a regex | |
28 my $orig_query = $query; | |
29 my $and_query = 0; | |
30 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg; | |
31 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){ | |
32 $and_query = 1; | |
33 } | |
34 $query =~ s/\s+or\s+/|/gi; | |
35 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name | |
36 #print STDERR "Query regex is $query\n" unless $quiet; | |
37 | |
38 open(OBO, $obo_file) | |
39 or die "Cannot open $obo_file for reading: $!\n"; | |
40 my %matched_pheno_ids; | |
41 my %pheno_id2subtypes; | |
42 my %pheno_id2name; | |
43 my $record_count; | |
44 $/ = "\n[Term]\n"; | |
45 <OBO>; # chuck header | |
46 while(<OBO>){ | |
47 next unless /^id:\s*(MP:\d+)/s; | |
48 my $id = $1; | |
49 next unless /\nname:\s*(.+?)\s*\n/s; | |
50 my $name = $1; | |
51 $pheno_id2name{$id} = $name; | |
52 $record_count++; | |
53 while(/\nis_a:\s*(MP:\d+)/g){ | |
54 my $parent_id = $1; | |
55 $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id}; | |
56 push @{$pheno_id2subtypes{$parent_id}}, $id; | |
57 } | |
58 if(exists $problematic_terms{$id}){ | |
59 if($name =~ /\b($query)/o){ # strict matching of name only if an entry with problematic free text | |
60 my $match = $1; | |
61 $match =~ tr/\t\n/ /; | |
62 $match =~ s/ {2,}/ /g; | |
63 if(not exists $matched_pheno_ids{$id}){ | |
64 $matched_pheno_ids{$id} = $match; | |
65 } | |
66 elsif($matched_pheno_ids{$id} !~ /$match/){ | |
67 $matched_pheno_ids{$id} .= "; $match"; | |
68 } | |
69 } | |
70 } | |
71 elsif(/\b($query)/o){ | |
72 my $match = $1; | |
73 $match =~ tr/\t\n/ /; | |
74 $match =~ s/ {2,}/ /g; | |
75 if(not exists $matched_pheno_ids{$id}){ | |
76 $matched_pheno_ids{$id} = $match; | |
77 } | |
78 elsif($matched_pheno_ids{$id} !~ /$match/){ | |
79 $matched_pheno_ids{$id} .= "; $match"; | |
80 } | |
81 } | |
82 } | |
83 close(OBO); | |
84 #print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n"; | |
85 | |
86 open(OUT, ">$out_file") | |
87 or die "Cannot open $out_file for writing: $!\n"; | |
88 | |
89 # Implements term subsumption | |
90 my @matched_pheno_ids = keys %matched_pheno_ids; | |
91 for(my $i = 0; $i <= $#matched_pheno_ids; $i++){ | |
92 my $pheno_id = $matched_pheno_ids[$i]; | |
93 next unless exists $pheno_id2subtypes{$pheno_id}; | |
94 for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){ | |
95 if(not exists $matched_pheno_ids{$sub_type_id}){ | |
96 $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id}; | |
97 push @matched_pheno_ids, $sub_type_id; | |
98 } | |
99 } | |
100 } | |
101 | |
102 $/="\n"; # record separator | |
103 my %human2mouse; | |
104 # example line: | |
105 # WNT3A 89780 Wnt3a MGI:98956 MP:0003012 MP:0003631 ... MP:0010768 | |
106 open(HUMAN2MOUSE, $human_mouse_file) | |
107 or die "Cannot open $human_mouse_file for reading: $!\n"; | |
108 while(<HUMAN2MOUSE>){ | |
109 my @F = split /\t/, $_; | |
110 $human2mouse{$F[0]} = $F[2]; | |
111 } | |
112 close(HUMAN2MOUSE); | |
113 | |
114 my %gene2pheno_ids; | |
115 # example line: | |
116 # Rbpj<tm1Kyo>/Rbpj<tm1Kyo> Rbpj<tm1Kyo> involves: 129S2/SvPas * C57BL/6 MP:0001614 15466160 MGI:96522 | |
117 open(PHENO, $geno_pheno_file) | |
118 or die "Cannot open $geno_pheno_file for reading: $!\n"; | |
119 while(<PHENO>){ | |
120 chomp; | |
121 my @F = split /\t/, $_; | |
122 next unless $#F > 2; # does it have the phenotype id field? | |
123 my $knockout = $F[0]; | |
124 next if $knockout =~ /,/; # ignore double knockouts etc. | |
125 $knockout =~ s/^(\S+?)<.*/$1/; # keep only first gene name bit of knockout description | |
126 my $pheno_id = $F[3]; | |
127 $gene2pheno_ids{$knockout} = [] unless exists $gene2pheno_ids{$knockout}; | |
128 push @{$gene2pheno_ids{$knockout}}, [$pheno_id,$F[4]]; | |
129 } | |
130 | |
131 # remove genes if they don't have a matching phenotype | |
132 for my $gene (keys %gene2pheno_ids){ | |
133 my $keep = 0; | |
134 for my $pheno_id (@{$gene2pheno_ids{$gene}}){ | |
135 if(exists $matched_pheno_ids{$pheno_id->[0]}){ | |
136 $keep = 1; | |
137 last; | |
138 } | |
139 } | |
140 delete $gene2pheno_ids{$gene} unless $keep; | |
141 } | |
142 #print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with knockout phenotype ontology terms matching the query\n" unless $quiet; | |
143 | |
144 $/ = "\n"; # one line at, a time from the HGVS file please! | |
145 open(HGVS, $hgvs_file) | |
146 or die "Cannot open $hgvs_file for reading: $!\n"; | |
147 my $header = <HGVS>; | |
148 chomp $header; | |
149 my @header_columns = split /\t/, $header; | |
150 my $gene_name_column; | |
151 for(my $i = 0; $i <= $#header_columns; $i++){ | |
152 if($header_columns[$i] eq "Gene Name"){ | |
153 $gene_name_column = $i; | |
154 } | |
155 } | |
156 if(not defined $gene_name_column){ | |
157 die "Could not find 'Gene Name' column in the input header, aborting\n"; | |
158 } | |
159 print OUT "$header\tMouse Knockout Phenotypes (matching $orig_query)\tMouse Phenotypes (other)\n"; | |
160 | |
161 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list | |
162 while(<HGVS>){ | |
163 chomp; | |
164 my @F = split /\t/, $_, -1; | |
165 my (%target_phenos, %other_phenos); | |
166 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){ | |
167 next unless exists $human2mouse{$gene_name}; | |
168 next unless exists $gene2pheno_ids{$human2mouse{$gene_name}}; | |
169 for my $pheno_id (@{$gene2pheno_ids{$human2mouse{$gene_name}}}){ | |
170 my ($id, $pmid) = @$pheno_id; | |
171 if(exists $matched_pheno_ids{$id}){ | |
172 $target_phenos{$pmid} = [] unless exists $target_phenos{$pmid}; | |
173 push @{$target_phenos{$pmid}}, $pheno_id2name{$id}."($matched_pheno_ids{$id})"; | |
174 } | |
175 else{ | |
176 $other_phenos{$pmid} = [] unless exists $other_phenos{$pmid}; | |
177 push @{$other_phenos{$pmid}}, $pheno_id2name{$id}; | |
178 } | |
179 } | |
180 } | |
181 if(%target_phenos){ | |
182 print OUT join("\t", @F); | |
183 print OUT "\t"; | |
184 my $count = 0; | |
185 for my $pmid (keys %target_phenos){ | |
186 print OUT " // " if $count++; | |
187 print OUT "PubMed $pmid: ", join("; ", @{$target_phenos{$pmid}}); | |
188 } | |
189 print OUT "\t"; | |
190 $count = 0; | |
191 for my $pmid (keys %other_phenos){ | |
192 print OUT " // " if $count++; | |
193 print OUT "PubMed $pmid: ", join("; ", @{$other_phenos{$pmid}}); | |
194 } | |
195 print OUT "\n"; | |
196 } | |
197 else{ | |
198 print OUT join("\t", @F, "", ""), "\n"; | |
199 } | |
200 } | |
201 close(HGVS); | |
202 close(OUT); |