diff master2pg.pl @ 2:35c20b109be5

Retrying upload with "bare" tarball (i.e. one without a top containing directory).
author cathy
date Tue, 28 May 2013 17:54:02 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/master2pg.pl	Tue May 28 17:54:02 2013 -0400
@@ -0,0 +1,157 @@
+#!/usr/bin/perl -w
+use strict;
+
+#convert from master variant file to pgSnp
+my $snpsOnly = 1; #flag for if doing SNPs or indels
+my $addRef = 0;   #flag whether to append ref allele
+if (@ARGV && $ARGV[0] eq 'indel') { shift @ARGV; $snpsOnly = 0; }
+if (@ARGV && $ARGV[0] eq 'addRef') { shift @ARGV; $addRef = 1; }
+if (!@ARGV) {
+   print STDERR "usage: master2pg.pl [indel addRef] infile.masterVar > outfile.pgSnp\n";
+   exit;
+}
+my $in = shift @ARGV;
+open(FH, $in) or die "Couldn't open input file $in, $!\n";
+
+while (<FH>) {
+   chomp;
+   if (/^#/) { next; }
+   if (/^>/) { next; } #headers
+   if (/^\s*$/) { next; } 
+   my @f = split(/\t/);
+   if (!$f[5]) { next; } #WHAT? most likely still zipped?
+   if ($f[5] =~ /(hom|het)/) { #zygosity #haploid chrX and chrY?
+      #only get snps for now
+      if ($snpsOnly && $f[6] eq 'snp') { #varType
+         my $a;
+         my $c = 2;
+         my $freq = 0; #set defaults
+         my $sc = 0;
+         if ($f[8] eq $f[9]) { #should be homozygous?
+            $a = $f[8];
+            $c = 1;
+         }else {
+            $a = "$f[8]/$f[9]";
+            $sc = "0,0";  #set defaults in case not included
+            $freq = '0,0';
+         }
+         if (defined $f[10] && $f[10] eq '') { $f[10] = 0; }
+         if (defined $f[11] && $f[11] eq '') { $f[11] = 0; }
+         if (defined $f[10] && $c == 1) {
+            $sc = $f[10];
+         }elsif (defined $f[10] && defined $f[11] && $c == 2) {
+            $sc = "$f[10],$f[11]";
+         }
+         if (defined $f[16] && $f[16] eq '') { $f[16] = 0; }
+         if (defined $f[17] && $f[17] eq '') { $f[17] = 0; }
+         if (defined $f[16] && $c == 1) {
+            $freq = $f[16];
+         }elsif (defined $f[16] && defined $f[17] && $c == 2) {
+            $freq = "$f[16],$f[17]";
+         }
+         print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc";
+         if ($addRef) { print "\t$f[7]\n"; }
+         else { print "\n"; }
+      }elsif (!$snpsOnly) {
+         if ($f[8] =~ /^\s*$/) { undef $f[8]; }
+         if ($f[9] =~ /^\s*$/) { undef $f[9]; }
+         my $a;
+         my $c = 2;
+         #do indels
+         if ($f[6] eq "ins") {
+            if (defined $f[8] && defined $f[9] && $f[8] eq $f[9]) { $a = $f[8]; $c = 1; }
+            elsif (defined $f[8] && defined $f[9] && $f[8] ne '?' && $f[9] ne '?') { 
+               $a = "$f[8]/$f[9]";
+            }elsif (!defined $f[8] && defined $f[9]) {
+               $a = "$f[9]/-";
+            }elsif (defined $f[8] && !defined $f[9]) {
+               $a = "$f[8]/-";
+            }
+         }elsif ($f[6] eq "del") {
+            if (!defined $f[8] && !defined $f[9]) {
+               $a = '-'; #homozygous deletion
+               $c = 1;
+            }elsif (!defined $f[8] && defined $f[9]) {
+               $a = "$f[9]/-";
+            }elsif (defined $f[8] && !defined $f[9]) {
+               $a = "$f[8]/-";
+            }            
+         }elsif ($f[6] eq "sub") { #multiple nt substitutions
+            if ($f[8] eq $f[9]) {
+               $a = $f[8];
+               $c = 1;
+            }else {
+               $a = "$f[8]/$f[9]";
+            }
+         }elsif ($f[6] eq "complex") { #treat same as multi-nt sub
+            if ($f[5] =~ /het-alt/ && !defined $f[8]) { $f[8] = '-'; }
+            if ($f[5] =~ /het-alt/ && !defined $f[9]) { $f[9] = '-'; }
+            if (defined $f[8] && defined $f[9] && $f[8] eq $f[9]) {
+               $c = 1;
+               $a = $f[8];
+            }elsif (defined $f[8] && defined $f[9]) {
+               $a = "$f[8]/$f[9]";
+            }
+         }
+         my $sc = '0';
+         my $freq = '0';
+         if ($c == 2) { $sc = '0,0'; $freq = '0,0'; }
+         if (defined $f[10] && $f[10] eq '') { $f[10] = 0; }
+         if (defined $f[11] && $f[11] eq '') { $f[11] = 0; }
+         if (defined $f[10] && $c == 1) {
+            $sc = $f[10];
+         }elsif (defined $f[10] && defined $f[11] && $c == 2) {
+            $sc = "$f[10],$f[11]";
+         }
+         if (defined $f[16] && $f[16] eq '') { $f[16] = 0; }
+         if (defined $f[17] && $f[17] eq '') { $f[17] = 0; }
+         if (defined $f[16] && $c == 1) {
+            if ($f[16] eq '') { $f[16] = 0; }
+            $freq = $f[16];
+         }elsif (defined $f[16] && defined $f[17] && $c == 2) {
+            $freq = "$f[16],$f[17]";
+         }
+         if ($a) {
+            print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc";
+            if ($addRef) { print "\t$f[7]\n"; }
+            else { print "\n"; }
+         }
+      }
+   }elsif ($f[5] eq 'hap' && $f[6] eq 'snp' && $snpsOnly) {
+      my $c = 1;
+      my $freq = '0';
+      if (defined $f[10] && $f[10] ne '') { $freq = $f[10]; }
+      my $sc = '0';
+      if (defined $f[16] && $f[16] ne '') { $sc = $f[16]; }
+      if ($f[8]) {
+         print "$f[2]\t$f[3]\t$f[4]\t$f[8]\t$c\t$freq\t$sc";
+         if ($addRef) { print "\t$f[7]\n"; }
+         else { print "\n"; }
+      }   
+   }elsif ($f[5] eq 'hap' && !$snpsOnly && $f[6] =~ /(del|ins|sub)/) {
+      if ($f[8] =~ /^\s*$/) { undef $f[8]; }
+      my $a;
+      my $c = 1;
+      #do indels
+      if ($f[6] eq "ins") {
+         $a = $f[8]; 
+      }elsif ($f[6] eq "del") {
+         $a = '-'; #deletion
+      }elsif ($f[6] eq "sub") { #multiple nt substitutions
+         $a = $f[8];
+      }
+      my $sc = '0';
+      my $freq = '0';
+      if (defined $f[10] && $f[10] ne '') { $sc = $f[10]; }
+      if (defined $f[16] && $f[16] ne '') { $freq = $f[16]; }
+      if ($a) {
+         print "$f[2]\t$f[3]\t$f[4]\t$a\t$c\t$freq\t$sc";
+         if ($addRef) { print "\t$f[7]\n"; }
+         else { print "\n"; }
+      }
+   }
+}
+
+close FH or die "Couldn't close $in, $!\n";
+
+exit;