Mercurial > repos > miller-lab > snp_analysis_conversion

diff master2gd_snp.pl @ 2:35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
author: cathy
date: Tue, 28 May 2013 17:54:02 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/master2gd_snp.pl	Tue May 28 17:54:02 2013 -0400
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+use strict;
+
+#convert from master variant file to snp table (Webb format?)
+#new format for version 2.0, also different format for cancer normal pairs
+#set columns for 2.0 version Cancer format
+my $aCnt1 = 21;
+my $aCnt2 = 22;
+
+#snp table format:
+#1. chr
+#2. position (0 based)
+#3. ref allele
+#4. second allele
+#5. overall quality
+#foreach individual (6-9, 10-13, ...)
+#a. count of allele in 3
+#b. count of allele in 4
+#c. genotype call (-1, or count of ref allele)
+#d. quality of genotype call (quality of non-ref allele from masterVar)
+
+if (!@ARGV) {
+   print "usage: master2gd_snp.pl masterVar.txt[.gz|.bz2] [-tab=snpTable.txt -addColsOnly -build=hg19 -name=na ] > newSnpTable.txt\n";
+   exit;
+}
+
+my $in = shift @ARGV;
+my $tab;
+my $tabOnly;
+my $build;
+my $name;
+foreach (@ARGV) {
+   if (/-tab=(.*)/) { $tab = $1; }
+   elsif (/-addColsOnly/) { $tabOnly = 1; }
+   elsif (/-build=(.*)/) { $build = $1; }
+   elsif (/-name=(.*)/) { $name = $1; }
+}
+
+#WARNING loads snp table in memory, this could take > 1G ram
+my %old;
+my $colcnt = 0;
+my @head;
+if ($tab) {
+   open(FH, $tab) or die "Couldn't open $tab, $!\n";
+   while (<FH>) {
+      chomp;
+      if (/^#/) { push(@head, $_); next; }
+      my @f = split(/\t/);
+      $old{"$f[0]:$f[1]"} = join("\t", @f);
+      $colcnt = scalar @f;
+   }
+   close FH or die "Couldn't close $tab, $!\n";
+}
+
+if ($in =~ /.gz$/) { 
+   open(FH, "zcat $in |") or die "Couldn't open $in, $!\n";
+}elsif ($in =~ /.bz2$/) {
+   open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n";
+}else {
+   open(FH, $in) or die "Couldn't open $in, $!\n";
+}
+prepHeader();
+if (@head) { #keep old header, add new?
+   print join("\n", @head), "\n";
+}
+while (<FH>) {
+   chomp;
+   #FORMAT_VERSION 2.0
+   if (/^#FORMAT_VERSION\s+1\./) { 
+      $aCnt1 = 16;
+      $aCnt2 = 17;
+   }
+   if (/^#/) { next; }
+   if (/^>/) { next; } #headers
+   if (/^\s*$/) { next; } 
+   my @f = split(/\t/);
+   if (!$f[6]) { next; } #WHAT? most likely still zipped?
+   if ($f[6] ne 'snp') { next; } #table only has substitutions
+   if ($f[5] eq 'het-alt') { next; } #skip heterozygous with no ref match
+   if ($f[5] =~ /(hom|het)/) { #zygosity #haploid chrX and chrY?
+         my $a = $f[7];  #reference allele
+         my $a2;
+         my $freq;
+         my $freq2;
+         my $sc = -1;
+         my $alt;
+         my $g = 1; #genotype == ref allele count
+         if ($f[8] eq $f[9]) { #should be homozygous?
+            $a2 = $f[8];
+            $g = 0;
+            if ($f[10] && $f[10] ne '') { $sc = $f[10]; }#is this the best one to use? or smallest?
+         }else {
+            if ($a ne $f[8]) { $a2 = $f[8]; $alt = 8; }
+            elsif ($a ne $f[9]) { $a2 = $f[9]; $alt = 9; }
+         }
+         if (defined $f[10] && defined $f[11] && $alt) { #VAF score in 2.0 format
+            if ($f[$alt+2] && $f[$alt+2] ne '') { $sc = $f[$alt+2]; }
+         }
+	 #version 1.12 columns 16 & 17, version 2.0 Cancer columns 21 & 22
+         if (defined $f[$aCnt1] && defined $f[$aCnt2] && $alt) {
+            if ($alt == 8) { 
+                $freq = $f[$aCnt2];
+                $freq2 = $f[$aCnt1];
+            }elsif ($alt == 9) {
+                $freq = $f[$aCnt1];
+                $freq2 = $f[$aCnt2];
+            }
+         }elsif (defined $f[$aCnt1]) {
+            $freq = 0;
+            $freq2 = $f[$aCnt1];
+         }
+         #if starting a new table or new SNP in old table
+         #add option to only build on current table?
+	 if (!$tab) { 
+            print "$f[2]\t$f[3]\t$a\t$a2\t-1"; 
+         }elsif (!$tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            print "$f[2]\t$f[3]\t$a\t$a2\t-1";
+         }elsif (exists $old{"$f[2]:$f[3]"}) {
+            print $old{"$f[2]:$f[3]"};
+            $old{"$f[2]:$f[3]"} = ''; #unset so we know it is printed
+         }elsif ($tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            next; #skip this one entirely
+         }
+         if ($colcnt && !exists $old{"$f[2]:$f[3]"}) { 
+            #new SNP pad for missing individuals
+            my $i = 5;
+            while ($i < $colcnt) {
+               print "\t-1\t-1\t-1\t-1";
+               $i += 4;
+            }
+         }
+         #add columns for individual
+         print "\t$freq\t$freq2\t$g\t$sc\n";
+   }elsif ($f[5] eq 'hap') {
+      my $g = 0;
+      my $freq = 0;
+      my $freq2 = 0; 
+      if (defined $f[10]) { $freq2 = $f[10]; }
+      my $sc = -1;
+      if (defined $f[$aCnt1] && $f[$aCnt1] ne '') { $sc = $f[$aCnt1]; }
+      if ($f[8]) {
+         if (!$tab) {
+            print "$f[2]\t$f[3]\t$f[7]\t$f[8]\t-1";
+         }elsif (!$tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            print "$f[2]\t$f[3]\t$f[7]\t$f[8]\t-1";
+         }elsif (exists $old{"$f[2]:$f[3]"}) {
+            print $old{"$f[2]:$f[3]"};
+            $old{"$f[2]:$f[3]"} = ''; #unset so we know it is printed
+         }elsif ($tabOnly && !exists $old{"$f[2]:$f[3]"}) {
+            next; #skip this one entirely
+         }
+         if ($colcnt && !exists $old{"$f[2]:$f[3]"}) {
+            #new SNP pad for missing individuals
+            my $i = 5;
+            while ($i < $colcnt) {
+               print "\t-1\t-1\t-1\t-1";
+               $i += 4;
+            }
+         }
+         #add columns for individual
+         print "\t$freq\t$freq2\t$g\t$sc\n";
+      }   
+   }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+#if adding to a snp table, now we need to finish those not in the latest set
+foreach my $k (keys %old) {
+   if ($old{$k} ne '') { #not printed yet
+      print $old{$k}, "\t-1\t-1\t-1\t-1\n"; #plus blank for this one
+   }
+}
+
+exit;
+
+#parse old header and add or create new
+sub prepHeader {
+   if (!$build) { $build = 'hg19'; } #set default
+   my @cnames;
+   my @ind;
+   my $n;
+   if (@head) { #parse previous header
+      my $h = join("", @head); #may split between lines
+      if ($h =~ /"column_names":\[(.*?)\]/) {
+         my @t = split(/,/, $1);
+         foreach (@t) { s/"//g; }
+         @cnames = @t;
+         $n = $cnames[$#cnames];
+         $n =~ s/Q//;
+         $n++;
+      }
+      if ($h =~ /"dbkey":"(.*?)"/) { $build = $1; }
+      if ($h =~ /"individuals":\[(.*)\]/) {
+         my $t = $1;
+         $t =~ s/\]\].*/]/; #remove if there is more categories
+         @ind = split(/,/, $t);
+      } 
+   }else { #start new header
+      @cnames = ("chr", "pos", "A", "B", "Q");   
+      $n = 1;
+   }
+   #add current
+   if (!$name) { $name= 'na'; }
+   my $stcol = $colcnt + 1;
+   if ($stcol == 1) { $stcol = 6; } #move past initial columns
+   push(@ind, "[\"$name\",$stcol]");
+   push(@cnames, "${n}A", "${n}B", "${n}G", "${n}Q");
+   #reassign head
+   undef @head;
+   foreach (@cnames) { $_ = "\"$_\""; } #quote name
+   $head[0] = "#{\"column_names\":[" . join(",", @cnames) . "],";
+   $head[1] = "#\"individuals\":[" . join(",", @ind) . "],"; 
+   $head[2] = "#\"dbkey\":\"$build\",\"pos\":2,\"rPos\":2,\"ref\":1,\"scaffold\":1,\"species\":\"$build\"}";
+}
+####End
+
+##example header 
+#{"column_names":["chr","pos","A","B","Q","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","7A","7B","7G","7Q","8A","8B","8G",
+#"8Q","9A","9B","9G","9Q","10A","10B","10G","10Q"],"dbkey":"hg19","individuals":[["Boh_15M",6],["Boh_19M",10],["Paya_27F",14],["Paya_2F",18],["Paya_32F",22],["Ruil_2M",26],["Ruil_36M",30],["Ruil_3M",
+#34],["Ruil_40",38],["Ruil_47F",42]],"pos":2,"rPos":2,"ref":1,"scaffold":1,"species":"hg19"}
+#chr1	10290	C	T	46.4	0	2	0	7	1	2	0	4	3	2	1	22	0	0	-1	0	1	0	1	4	0	2	0	7	0	0	-1	0	2	3	1	14	0	1	0	4	1	1	1	6
author	cathy
date	Tue, 28 May 2013 17:54:02 -0400
parents
children