annotate filter_by_human_phenotype_ontology_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 my $quiet = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 if(@ARGV and $ARGV[0] =~ /^-q/){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 $quiet = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 @ARGV == 4 or die "Usage: $0 [-q(uiet)] <human_phenotype_ontology_dir> <hgvs_annotated.txt> <output.txt> <query>\n",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 "Where query has the format \"this or that\", \"this and that\", etc.\n",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 "Human phenotype files are available from http://compbio.charite.de/svn/hpo/trunk/src/ontology/human-phenotype-ontology.obo and genes_to_phenotype.txt\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 my $hpo_dir = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 my $obo_file = "$hpo_dir/human-phenotype-ontology.obo";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my $gene_pheno_file = "$hpo_dir/genes_to_phenotype.txt";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my $hgvs_file = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $out_file = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my $query = shift @ARGV;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 # Below is a list of ontology terms with definitions including hyponyms and meronyms, so only match text on title so as not to get too many false positives
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 my %problematic = qw(HP:0003271 Visceromegaly
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 HP:0000246 Sinusitis);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 # Ontology terms with HPO in the free text, leading to nasty overmatching when searching for gene HPO
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my %self_referencing = (
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 "HP:0000118" => "Phenotypic abnormality",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 "HP:0000455" => "Broad nasal tip",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 "HP:0000968" => "Ectodermal dysplasia",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 "HP:0002652" => "Skeletal dysplasia",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 "HP:0003812" => "Phenotypic variability",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 "HP:0003828" => "Variable expressivity",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 "HP:0003829" => "Incomplete penetrance",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 "HP:0004472" => "Mandibular hyperostosis",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 "HP:0004495" => "Thin anteverted nares",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 "HP:0005286" => "Hypoplastic, notched nares",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 "HP:0005321" => "Mandibulofacial dysostosis",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 "HP:0005871" => "Metaphyseal chondrodysplasia",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 "HP:0007589" => "Aplasia cutis congenita on trunk or limbs",
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 "HP:0007819" => "Presenile cataracts");
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 # Ignore metadata field words, so we can properly match gene names like HPO :-)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 my %stop_words = ("name:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 "HPO:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 "alt_id:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 "def:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 "synonym:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 "EXACT" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 "xref:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 "UMLS:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 "is_a:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 "created_by:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 "comment:" => 1,
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 "creation_date:" => 1);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 # convert the query to a regex
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 my $orig_query = $query;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 my $and_query = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 $query =~ s/\(\s*.+?\s*\)/my $s=$&;$s=~s(\s+or\s+)(|)g; $s/eg;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 if($query =~ s/(\S+)\s+and\s+(\S+)/(?:$1.*?$2|$2.*?$1)/gi){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 $and_query = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 $query =~ s/\s+or\s+/|/gi;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 $query =~ s/\b([a-z])([a-z]+\b)/"[".uc($1)."$1]$2"/eg; # allow title case match in regex for letter only lower case words, otherwise make case sensitive as assuming gene name
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 #print STDERR "Query regex is $query\n" unless $quiet;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 open(OBO, $obo_file)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 or die "Cannot open $obo_file for reading: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 my %matched_pheno_ids;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 my %pheno_id2subtypes;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 my %pheno_id2name;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 my $record_count;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 $/ = "\n[Term]\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 <OBO>; # chuck header
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 while(<OBO>){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 next unless /^id:\s*(HP:\d+)/s;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 my $id = $1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 next unless /\nname:\s*(.+?)\s*\n/s;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 my $name = $1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 $pheno_id2name{$id} = $name;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 $record_count++;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 while(/\nis_a:\s*(HP:\d+)/g){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 my $parent_id = $1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 $pheno_id2subtypes{$parent_id} = [] unless exists $pheno_id2subtypes{$parent_id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 push @{$pheno_id2subtypes{$parent_id}}, $id;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 s/(UMLS:\S+\s+")(.+?)(?=")/$1.lc($2)/eg;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 if(exists $problematic{$id}){ # for overmatching terms due to their descriptions (hyponyms and meronyms included in
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 # parent entry), only match the title to not generate too many false poositives
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 while($name =~ /\b($query)(\S*?:?)/go){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 next if exists $stop_words{$1.$2};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 my $match = $1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 $match =~ tr/\t\n/ /;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 $match =~ s/\s{2,}/ /g;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 if(not exists $matched_pheno_ids{$id}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 $matched_pheno_ids{$id} = $match;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 elsif($matched_pheno_ids{$id} !~ /$match/){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 $matched_pheno_ids{$id} .= "; $match";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 else{ # normally, match anywhere in the entry
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 while(/\b($query)(\S*?:?)/go){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 next if defined $2 and exists $stop_words{$1.$2} or $1 eq "HPO" and $self_referencing{$id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 my $match = $1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 $match =~ tr/\t\n/ /;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 $match =~ s/\s{2,}/ /g;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 if(not exists $matched_pheno_ids{$id}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 $matched_pheno_ids{$id} = $match;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 elsif($matched_pheno_ids{$id} !~ /\Q$match\E/){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 $matched_pheno_ids{$id} .= "; $match";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 #print STDERR "Match $match for $_\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 close(OBO);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 #print STDERR "Found ", scalar(keys %matched_pheno_ids), "/$record_count phenotype ontology terms matching the query\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 # Implements term subsumption
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 my @matched_pheno_ids = keys %matched_pheno_ids;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 for(my $i = 0; $i <= $#matched_pheno_ids; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 my $pheno_id = $matched_pheno_ids[$i];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 next unless exists $pheno_id2subtypes{$pheno_id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 for my $sub_type_id (@{$pheno_id2subtypes{$pheno_id}}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 if(not exists $matched_pheno_ids{$sub_type_id}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 $matched_pheno_ids{$sub_type_id} = $matched_pheno_ids{$pheno_id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 push @matched_pheno_ids, $sub_type_id;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 $/="\n"; # record separator
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 my %gene2pheno_ids;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 # Format: entrez-gene-id<tab>entrez-gene-symbol<tab>HPO-Term-Name<tab>HPO-Term-ID
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 open(PHENO, $gene_pheno_file)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 or die "Cannot open $gene_pheno_file for reading: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 while(<PHENO>){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 chomp;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 my @F = split /\t/, $_;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 next unless $#F > 2; # does it have the phenotype id field?
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 my $gene = $F[1];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 my $pheno_id = $F[3];
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 $gene2pheno_ids{$gene} = [] unless exists $gene2pheno_ids{$gene};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 push @{$gene2pheno_ids{$gene}}, $pheno_id;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 # remove genes if they don't have a matching phenotype
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 for my $gene (keys %gene2pheno_ids){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 my $keep = 0;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 for my $pheno_id (@{$gene2pheno_ids{$gene}}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 if(exists $matched_pheno_ids{$pheno_id}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 $keep = 1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 last;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 delete $gene2pheno_ids{$gene} unless $keep;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 #print STDERR "Found ", scalar(keys %gene2pheno_ids), " genes with human phenotype ontology terms matching the query\n" unless $quiet;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 $/ = "\n"; # one line at, a time from the HGVS file please!
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 open(HGVS, $hgvs_file)
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 or die "Cannot open $hgvs_file for reading: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 my $header = <HGVS>;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 chomp $header;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 my @header_columns = split /\t/, $header;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 my $gene_name_column;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 for(my $i = 0; $i <= $#header_columns; $i++){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 if($header_columns[$i] eq "Gene Name"){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 $gene_name_column = $i;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 if(not defined $gene_name_column){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 die "Could not find 'Gene Name' column in the input header, aborting\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 open(OUT, ">$out_file")
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 or die "Cannot open $out_file for writing: $!\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 print OUT "$header\tHuman Phenotypes (matching $orig_query)\tHuman Phenotypes (other)\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 # Check if any of the variants in the annotated HGVS table are in knockout genes matching the target phenotypes list
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 while(<HGVS>){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 chomp;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 my @F = split /\t/, $_, -1;
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 my (@target_phenos, @other_phenos);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 next unless exists $gene2pheno_ids{$gene_name};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 for my $id (@{$gene2pheno_ids{$gene_name}}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 next unless exists $pheno_id2name{$id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 if(exists $matched_pheno_ids{$id}){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 push @target_phenos, $pheno_id2name{$id}."($matched_pheno_ids{$id})";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 else{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 push @other_phenos, $pheno_id2name{$id};
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 if(@target_phenos){
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 print OUT join("\t", @F, join("; ", @target_phenos), join("; ", @other_phenos)), "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 else{
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 print OUT join("\t", @F, "", ""), "\n";
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 }
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 close(OUT);
6411ca16916e initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 close(HGVS);