Mercurial > repos > yusuf > associate_phenotypes
diff filter_by_susceptibility_loci_pipe @ 0:6411ca16916e default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:23:29 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_susceptibility_loci_pipe Wed Mar 25 13:23:29 2015 -0600 @@ -0,0 +1,174 @@ +#!/usr/bin/env perl + +# Reports SNPs associated with susceptibility loci determined by GWAS +use strict; +use warnings; + +@ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n"; +my $gwas_file = shift @ARGV; +my $hgvs_file = shift @ARGV; +my $outfile = shift @ARGV; +my $pheno_query = shift @ARGV; + +$pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace +my $s; +$pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg; +my @or_terms = split /\s+or\s+/i, $pheno_query; +for my $term (@or_terms){ + $term = and_query($term); # recursively convert Boolean AND to equivalent regex +} + +# Date Added to Catalog PUBMEDID First Author Date Journal Link Study Disease/Trait Initial Sample Size Replication Sample Size Region Chr_id Chr_pos Reported Gene(s) Mapped_gene Upstream_gene_id Downstream_gene_id Snp_gene_ids Upstream_gene_distance Downstream_gene_distance Strongest SNP-Risk Allele SNPs Merged Snp_id_current Context Intergenic Risk Allele Frequency p-Value Pvalue_mlog p-Value (text) OR or beta 95% CI (text) Platform [SNPs passing QC] CNV +open(GWAS, $gwas_file) + or die "Cannot open GWAS file for reading: $!\n"; +my $header = <GWAS>; +chomp $header; +my @columns = split /\t/, $header; +my $trait_column_index = -1; +my $study_column_index = -1; +my $chr_column_index = -1; +my $pos_column_index = -1; +my $allele_column_index = -1; +my $context_column_index = -1; +my $odds_column_index = -1; +my $pubmed_column_index = -1; +for(my $i = 0; $i < $#columns; $i++){ + if($columns[$i] eq "Disease/Trait"){ + $trait_column_index = $i; + } + elsif($columns[$i] eq "Study"){ + $study_column_index = $i; + } + elsif($columns[$i] eq "Chr_id"){ + $chr_column_index = $i; + } + elsif($columns[$i] eq "Chr_pos"){ + $pos_column_index = $i; + } + elsif($columns[$i] eq "Strongest SNP-Risk Allele"){ + $allele_column_index = $i; + } + elsif($columns[$i] eq "p-Value (text)"){ + $context_column_index = $i; + } + elsif($columns[$i] eq "OR or beta"){ + $odds_column_index = $i; + } + elsif($columns[$i] eq "PUBMEDID"){ + $pubmed_column_index = $i; + } +} +if($trait_column_index == -1){ + die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($study_column_index == -1){ + die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($chr_column_index == -1){ + die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($pos_column_index == -1){ + die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($allele_column_index == -1){ + die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($context_column_index == -1){ + die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($odds_column_index == -1){ + die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; +} +if($pubmed_column_index == -1){ + die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; +} + +my %snp2desc; +while(<GWAS>){ + chomp; + next unless /\S/; + my @F = split /\t/, $_; + my $chr = $F[$chr_column_index]; + $chr =~ s/^chr//; # drop prefix if present + my $strongest_risk_allele = $F[$allele_column_index]; + $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter + $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]]; +} +close(GWAS); + +open(MATCHOUT, ">$outfile") + or die "Cannot open $outfile for writing: $!\n"; + +open(HGVS, $hgvs_file) + or die "Cannot open $hgvs_file for reading: $!\n"; +$header = <HGVS>; +chomp $header; +my @header_columns = split /\t/, $header; +my ($chr_column, $obs_column, $pos_column); +for(my $i = 0; $i <= $#header_columns; $i++){ + if($header_columns[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($header_columns[$i] eq "Obs base"){ + $obs_column = $i; + } + elsif($header_columns[$i] eq "Chr"){ + $chr_column = $i; + } +} +if(not defined $chr_column){ + die "Could not find 'Chr' column in the input header, aborting\n"; +} +if(not defined $pos_column){ + die "Could not find 'DNA From' column in the input header, aborting\n"; +} +if(not defined $obs_column){ + die "Could not find 'Obs base' column in the input header, aborting\n"; +} + +print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n"; +while(<HGVS>){ + chomp; + my @F = split /\t/, $_, -1; + my $allele = $F[$obs_column]; + $allele =~ s(/.*$)(); # ignore ref allele in het calls + my $chr = $F[$chr_column]; + $chr =~ s/^chr//; # remove chr name prefix if present + my $key = $chr.":".$F[$pos_column].":".$allele; + #print STDERR "$key\n"; + if(not exists $snp2desc{$key}){ + print MATCHOUT join("\t", @F, "", "", ""), "\n"; + next; + } + my @matches; + for my $term (@or_terms){ + next if grep {$_ eq $term} @matches; + #print STDERR "Checking match for term $term...\n"; + if($snp2desc{$key}->[0] =~ /\b$term/i or # trait + $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context + $snp2desc{$key}->[2] =~ /\b$term/i){ # study name + push @matches, $term; + last; + } + } + chomp $F[$#F]; + # print all GWAS, whether they match query or not + print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n"; +} +close(HGVS); +close(MATCHOUT); + +sub and_query{ + my ($query) = @_; + if($query =~ /^(.+)\s+and\s+(.+)$/){ + my $t1 = $1; + my $t2 = $2; + my $term1 = and_query($t1); + my $term2 = and_query($t2); + return "($term1.*$term2|$term2.*$term1)"; + } + else{ + return $query; # base case: single term + } + return +}