diff filter_by_susceptibility_loci_pipe @ 0:6411ca16916e default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:23:29 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_susceptibility_loci_pipe	Wed Mar 25 13:23:29 2015 -0600
@@ -0,0 +1,174 @@
+#!/usr/bin/env perl
+
+# Reports SNPs associated with susceptibility loci determined by GWAS
+use strict;
+use warnings;
+
+@ARGV == 4 or die "Usage: $0 <GWAS db.txt> <input.annotated.hgvs.txt> <output.txt> <pheno query>\n";
+my $gwas_file = shift @ARGV;
+my $hgvs_file = shift @ARGV;
+my $outfile = shift @ARGV;
+my $pheno_query = shift @ARGV;
+
+$pheno_query =~ s/^\s+|\s+$//g; # trim leading and trailing whitespace
+my $s;
+$pheno_query =~ s/\(.*?\)/$s=$&; $s=~s#\s+or\s+#|#g; $s/eg;
+my @or_terms = split /\s+or\s+/i, $pheno_query;
+for my $term (@or_terms){
+  $term = and_query($term); # recursively convert Boolean AND to equivalent regex
+}
+
+# Date Added to Catalog	PUBMEDID	First Author	Date	Journal	Link	Study	Disease/Trait	Initial Sample Size	Replication Sample Size	Region	Chr_id	Chr_pos	Reported Gene(s)	Mapped_gene	Upstream_gene_id	Downstream_gene_id	Snp_gene_ids	Upstream_gene_distance	Downstream_gene_distance	Strongest SNP-Risk Allele	SNPs	Merged	Snp_id_current	Context	Intergenic	Risk Allele Frequency	p-Value	Pvalue_mlog	p-Value (text)	OR or beta	95% CI (text)	Platform [SNPs passing QC]	CNV
+open(GWAS, $gwas_file)
+  or die "Cannot open GWAS file for reading: $!\n";
+my $header = <GWAS>;
+chomp $header;
+my @columns = split /\t/, $header;
+my $trait_column_index = -1;
+my $study_column_index = -1;
+my $chr_column_index = -1;
+my $pos_column_index = -1;
+my $allele_column_index = -1;
+my $context_column_index = -1;
+my $odds_column_index = -1;
+my $pubmed_column_index = -1;
+for(my $i = 0; $i < $#columns; $i++){
+  if($columns[$i] eq "Disease/Trait"){
+    $trait_column_index = $i;
+  }
+  elsif($columns[$i] eq "Study"){
+    $study_column_index = $i;
+  }
+  elsif($columns[$i] eq "Chr_id"){
+    $chr_column_index = $i;
+  }
+  elsif($columns[$i] eq "Chr_pos"){
+    $pos_column_index = $i;
+  }
+  elsif($columns[$i] eq "Strongest SNP-Risk Allele"){
+    $allele_column_index = $i;
+  }
+  elsif($columns[$i] eq "p-Value (text)"){
+    $context_column_index = $i;
+  }
+  elsif($columns[$i] eq "OR or beta"){
+    $odds_column_index = $i;
+  }
+  elsif($columns[$i] eq "PUBMEDID"){
+    $pubmed_column_index = $i;
+  }
+} 
+if($trait_column_index == -1){
+  die "Could not find Trait column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($study_column_index == -1){
+  die "Could not find Study column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($chr_column_index == -1){
+  die "Could not find chromosome name column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($pos_column_index == -1){
+  die "Could not find chromosomal position column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($allele_column_index == -1){
+  die "Could not find risk allele column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($context_column_index == -1){
+  die "Could not find p-value context column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($odds_column_index == -1){
+  die "Could not find odds ratio column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+if($pubmed_column_index == -1){
+  die "Could not find PubMed ID column header in provided GWAS catalog file $gwas_file, aborting\n"; 
+}
+
+my %snp2desc;
+while(<GWAS>){
+  chomp;
+  next unless /\S/;
+  my @F = split /\t/, $_;
+  my $chr = $F[$chr_column_index];
+  $chr =~ s/^chr//; # drop prefix if present
+  my $strongest_risk_allele = $F[$allele_column_index];
+  $strongest_risk_allele =~ s/^.*-//; # initially looks like rs4345897-G, strip to just letter
+  $snp2desc{$chr.":".$F[$pos_column_index].":".$strongest_risk_allele} = [$F[$trait_column_index], $F[$context_column_index], $F[$study_column_index], $F[$odds_column_index], $F[$pubmed_column_index]];
+}
+close(GWAS);
+
+open(MATCHOUT, ">$outfile")
+  or die "Cannot open $outfile for writing: $!\n";
+
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+$header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my ($chr_column, $obs_column, $pos_column);
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "DNA From"){
+    $pos_column = $i;
+  }
+  elsif($header_columns[$i] eq "Obs base"){
+    $obs_column = $i;
+  }
+  elsif($header_columns[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Could not find 'Chr' column in the input header, aborting\n";
+}
+if(not defined $pos_column){
+  die "Could not find 'DNA From' column in the input header, aborting\n";
+}
+if(not defined $obs_column){
+  die "Could not find 'Obs base' column in the input header, aborting\n";
+}
+
+print MATCHOUT $header, "\tPhenotype GWAS Catalog text matches (matching $pheno_query)\tGWAS odds ratio\tGWAS susceptibility description for this SNP\n";
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my $allele = $F[$obs_column];
+  $allele =~ s(/.*$)(); # ignore ref allele in het calls
+  my $chr = $F[$chr_column];
+  $chr =~ s/^chr//; # remove chr name prefix if present
+  my $key = $chr.":".$F[$pos_column].":".$allele;
+  #print STDERR "$key\n";
+  if(not exists $snp2desc{$key}){
+    print MATCHOUT join("\t", @F, "", "", ""), "\n";
+    next;
+  }
+  my @matches;
+  for my $term (@or_terms){
+    next if grep {$_ eq $term} @matches;
+    #print STDERR "Checking match for term $term...\n";
+    if($snp2desc{$key}->[0] =~ /\b$term/i or # trait
+       $snp2desc{$key}->[1] =~ /\b$term/i or # p-value context
+       $snp2desc{$key}->[2] =~ /\b$term/i){  # study name
+      push @matches, $term;
+      last;
+    }
+  }
+  chomp $F[$#F];
+  # print all GWAS, whether they match query or not
+  print MATCHOUT join("\t", @F, join("; ", sort @matches), $snp2desc{$key}->[3], "PubMedID ".$snp2desc{$key}->[4].": trait '".$snp2desc{$key}->[0]."'".($snp2desc{$key}->[1] =~ /\S/ ? " in context of '".$snp2desc{$key}->[1]."'" : "").". Study: ".$snp2desc{$key}->[2]), "\n";
+}
+close(HGVS);
+close(MATCHOUT);
+
+sub and_query{
+  my ($query) = @_;
+  if($query =~ /^(.+)\s+and\s+(.+)$/){
+    my $t1 = $1;
+    my $t2 = $2;
+    my $term1 = and_query($t1);
+    my $term2 = and_query($t2);
+    return "($term1.*$term2|$term2.*$term1)";
+  }
+  else{
+    return $query; # base case: single term
+  }
+  return 
+}