view filter_by_susceptibility_loci_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

# Reports SNPs associated with susceptibility loci determined by GWAS
use strict;
use warnings;

@ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n";
my $gwas_file = shift @ARGV;
my $hgvs_file = shift @ARGV;
my $outfile = shift @ARGV;
my $pheno_query = shift @ARGV;

$pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace
my $s;
$pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg;
my @or_terms = split /\s+or\s+/i, $pheno_query;
for my $term (@or_terms){
  $term = and_query($term); # recursively convert Boolean AND to equivalent regex
}

# Date Added to Catalog	PUBMEDID	First Author	Date	Journal	Link	Study	Disease/Trait	Initial Sample Size	Replication Sample Size	Region	Chr_id	Chr_pos	Reported Gene(s)	Mapped_gene	Upstream_gene_id	Downstream_gene_id	Snp_gene_ids	Upstream_gene_distance	Downstream_gene_distance	Strongest SNP-Risk Allele	SNPs	Merged	Snp_id_current	Context	Intergenic	Risk Allele Frequency	p-Value	Pvalue_mlog	p-Value (text)	OR or beta	95% CI (text)	Platform [SNPs passing QC]	CNV
open(GWAS, $gwas_file)
  or die "Cannot open GWAS file for reading: $!\n";
my $header = <GWAS>;
chomp $header;
my @columns = split /\t/, $header;
my $trait_column_index = -1;
my $study_column_index = -1;
my $chr_column_index = -1;
my $pos_column_index = -1;
my $allele_column_index = -1;
my $context_column_index = -1;
my $odds_column_index = -1;
my $pubmed_column_index = -1;
for(my $i = 0; $i < $#columns; $i++){
  if($columns[$i] eq "Disease/Trait"){
    $trait_column_index = $i;
  }
  elsif($columns[$i] eq "Study"){
    $study_column_index = $i;
  }
  elsif($columns[$i] eq "Chr_id"){
    $chr_column_index = $i;
  }
  elsif($columns[$i] eq "Chr_pos"){
    $pos_column_index = $i;
  }
  elsif($columns[$i] eq "Strongest SNP-Risk Allele"){
    $allele_column_index = $i;
  }
  elsif($columns[$i] eq "p-Value (text)"){
    $context_column_index = $i;
  }
  elsif($columns[$i] eq "OR or beta"){
    $odds_column_index = $i;
  }
  elsif($columns[$i] eq "PUBMEDID"){
    $pubmed_column_index = $i;
  }
} 
if($trait_column_index == -1){
  die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($study_column_index == -1){
  die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($chr_column_index == -1){
  die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($pos_column_index == -1){
  die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($allele_column_index == -1){
  die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($context_column_index == -1){
  die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($odds_column_index == -1){
  die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}
if($pubmed_column_index == -1){
  die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; 
}

my %snp2desc;
while(<GWAS>){
  chomp;
  next unless /\S/;
  my @F = split /\t/, $_;
  my $chr = $F[$chr_column_index];
  $chr =~ s/^chr//; # drop prefix if present
  my $strongest_risk_allele = $F[$allele_column_index];
  $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter
  $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]];
}
close(GWAS);

open(MATCHOUT, ">$outfile")
  or die "Cannot open $outfile for writing: $!\n";

open(HGVS, $hgvs_file)
  or die "Cannot open $hgvs_file for reading: $!\n";
$header = <HGVS>;
chomp $header;
my @header_columns = split /\t/, $header;
my ($chr_column, $obs_column, $pos_column);
for(my $i = 0; $i <= $#header_columns; $i++){
  if($header_columns[$i] eq "DNA From"){
    $pos_column = $i;
  }
  elsif($header_columns[$i] eq "Obs base"){
    $obs_column = $i;
  }
  elsif($header_columns[$i] eq "Chr"){
    $chr_column = $i;
  }
}
if(not defined $chr_column){
  die "Could not find 'Chr' column in the input header, aborting\n";
}
if(not defined $pos_column){
  die "Could not find 'DNA From' column in the input header, aborting\n";
}
if(not defined $obs_column){
  die "Could not find 'Obs base' column in the input header, aborting\n";
}

print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n";
while(<HGVS>){
  chomp;
  my @F = split /\t/, $_, -1;
  my $allele = $F[$obs_column];
  $allele =~ s(/.*$)(); # ignore ref allele in het calls
  my $chr = $F[$chr_column];
  $chr =~ s/^chr//; # remove chr name prefix if present
  my $key = $chr.":".$F[$pos_column].":".$allele;
  #print STDERR "$key\n";
  if(not exists $snp2desc{$key}){
    print MATCHOUT join("\t", @F, "", "", ""), "\n";
    next;
  }
  my @matches;
  for my $term (@or_terms){
    next if grep {$_ eq $term} @matches;
    #print STDERR "Checking match for term $term...\n";
    if($snp2desc{$key}->[0] =~ /\b$term/i or # trait
       $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context
       $snp2desc{$key}->[2] =~ /\b$term/i){  # study name
      push @matches, $term;
      last;
    }
  }
  chomp $F[$#F];
  # print all GWAS, whether they match query or not
  print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n";
}
close(HGVS);
close(MATCHOUT);

sub and_query{
  my ($query) = @_;
  if($query =~ /^(.+)\s+and\s+(.+)$/){
    my $t1 = $1;
    my $t2 = $2;
    my $term1 = and_query($t1);
    my $term2 = and_query($t2);
    return "($term1.*$term2|$term2.*$term1)";
  }
  else{
    return $query; # base case: single term
  }
  return 
}