| 
0
 | 
     1 #!/usr/bin/perl -w 
 | 
| 
 | 
     2 use strict;
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 #########################################################################
 | 
| 
 | 
     5 #	codingSnps.pl
 | 
| 
 | 
     6 #	This takes a bed file with the names being / separated nts
 | 
| 
 | 
     7 #	and a gene bed file with cds start and stop.
 | 
| 
 | 
     8 #	It then checks for changes in coding regions, reporting
 | 
| 
 | 
     9 #	those that cause a frameshift or substitution in the amino acid.
 | 
| 
 | 
    10 #########################################################################
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
 | 
| 
 | 
    13 if (!@ARGV or scalar @ARGV < 3) {
 | 
| 
 | 
    14    print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n";
 | 
| 
 | 
    15    exit;
 | 
| 
 | 
    16 }
 | 
| 
 | 
    17 my $uniq = 0; #flag for whether want uniq positions
 | 
| 
 | 
    18 my $syn = 0;  #flag for if want synonomous changes rather than non-syn
 | 
| 
 | 
    19 my $keep = 0; #keep old columns and append new ones
 | 
| 
 | 
    20 my $snpFile = shift @ARGV;
 | 
| 
 | 
    21 my $geneFile = shift @ARGV;
 | 
| 
 | 
    22 my $nibDir = shift @ARGV;  #2bit or nib, depending on flag above
 | 
| 
 | 
    23 if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
 | 
| 
 | 
    24 my $col0 = 0; #bed like columns in default positions
 | 
| 
 | 
    25 my $col1 = 1;
 | 
| 
 | 
    26 my $col2 = 2;
 | 
| 
 | 
    27 my $col3 = 3;
 | 
| 
 | 
    28 #column positions 1 based coming in (for Galaxy)
 | 
| 
 | 
    29 foreach (@ARGV) {
 | 
| 
 | 
    30    if (/chr=(\d+)/) { $col0 = $1 -1; }
 | 
| 
 | 
    31    elsif (/start=(\d+)/) { $col1 = $1 -1; }
 | 
| 
 | 
    32    elsif (/end=(\d+)/) { $col2 = $1 -1; }
 | 
| 
 | 
    33    elsif (/snp=(\d+)/) { $col3 = $1 -1; }
 | 
| 
 | 
    34    elsif (/keepColumns=1/) { $keep = 1; }
 | 
| 
 | 
    35 }
 | 
| 
 | 
    36 if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
 | 
| 
 | 
    37    print STDERR "ERROR column numbers are given with origin 1\n";
 | 
| 
 | 
    38    exit 1;
 | 
| 
 | 
    39 }
 | 
| 
 | 
    40 my @genes; #bed lines for genes, sorted by chrom and start
 | 
| 
 | 
    41 my %chrSt; #index in array where each chrom starts
 | 
| 
 | 
    42 my %codon; #hash of codon amino acid conversions
 | 
| 
 | 
    43 my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
 | 
| 
 | 
    44 my $ignoreN = 1; #skip N
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 my %amb = (
 | 
| 
 | 
    47 "R" => "A/G",
 | 
| 
 | 
    48 "Y" => "C/T",
 | 
| 
 | 
    49 "S" => "C/G",
 | 
| 
 | 
    50 "W" => "A/T",
 | 
| 
 | 
    51 "K" => "G/T",
 | 
| 
 | 
    52 "M" => "A/C",
 | 
| 
 | 
    53 "B" => "C/G/T",
 | 
| 
 | 
    54 "D" => "A/G/T",
 | 
| 
 | 
    55 "H" => "A/C/T",
 | 
| 
 | 
    56 "V" => "A/C/G",
 | 
| 
 | 
    57 "N" => "A/C/G/T"
 | 
| 
 | 
    58 );
 | 
| 
 | 
    59 fill_codon();
 | 
| 
 | 
    60 open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") 
 | 
| 
 | 
    61    or die "Couldn't open and sort $geneFile, $!\n";
 | 
| 
 | 
    62 my $i = 0;
 | 
| 
 | 
    63 while(<FH>) {
 | 
| 
 | 
    64    chomp;
 | 
| 
 | 
    65    if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
 | 
| 
 | 
    66    push(@genes, "$_");
 | 
| 
 | 
    67    my @f = split(/\t/);
 | 
| 
 | 
    68    if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
 | 
| 
 | 
    69    $i++;
 | 
| 
 | 
    70 }
 | 
| 
 | 
    71 close FH or die "Couldn't close $geneFile, $!\n";
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
 | 
| 
 | 
    74 
 | 
| 
 | 
    75 #open snps sorted as well
 | 
| 
 | 
    76 my $s1 = $col0 + 1; #sort order is origin 1
 | 
| 
 | 
    77 my $s2 = $col1 + 1; 
 | 
| 
 | 
    78 open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
 | 
| 
 | 
    79    or die "Couldn't open and sort $snpFile, $!\n";
 | 
| 
 | 
    80 $i = 0;
 | 
| 
 | 
    81 my @g; #one genes fields, should be used repeatedly
 | 
| 
 | 
    82 my %done;
 | 
| 
 | 
    83 while(<FH>) {
 | 
| 
 | 
    84    chomp;
 | 
| 
 | 
    85    if (/^\s*#/) { next; } #comment
 | 
| 
 | 
    86    my @s = split(/\t/); #SNP fields
 | 
| 
 | 
    87    if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
 | 
| 
 | 
    88    my $size = $#s;
 | 
| 
 | 
    89    if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
 | 
| 
 | 
    90       print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
 | 
| 
 | 
    91       exit 1;
 | 
| 
 | 
    92    }
 | 
| 
 | 
    93    if ($s[$col1] =~ /\D/) { 
 | 
| 
 | 
    94       print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
 | 
| 
 | 
    95       exit 1;
 | 
| 
 | 
    96    }
 | 
| 
 | 
    97    if ($s[$col2] =~ /\D/) {
 | 
| 
 | 
    98       print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
 | 
| 
 | 
    99       exit 1;
 | 
| 
 | 
   100    }
 | 
| 
 | 
   101    if ($s[$col3] eq 'N' && $ignoreN) { next; }
 | 
| 
 | 
   102    if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
 | 
| 
 | 
   103    if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
 | 
| 
 | 
   104       $i = $chrSt{$s[$col0]};
 | 
| 
 | 
   105       @g = split(/\t/, $genes[$i]);
 | 
| 
 | 
   106       if (scalar @g < 12) {  
 | 
| 
 | 
   107          print STDERR "ERROR the gene file must be the whole genes in BED format\n";
 | 
| 
 | 
   108          exit 1;
 | 
| 
 | 
   109       }
 | 
| 
 | 
   110    }elsif (!@g) { 
 | 
| 
 | 
   111       next; #no gene for this chrom
 | 
| 
 | 
   112    }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom 
 | 
| 
 | 
   113       $i = $chrSt{$s[$col0]};
 | 
| 
 | 
   114       @g = split(/\t/, $genes[$i]);
 | 
| 
 | 
   115    }elsif ($s[$col0] ne $g[0]) {
 | 
| 
 | 
   116       next; #no gene for this chrom
 | 
| 
 | 
   117    }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
 | 
| 
 | 
   118       next; #before any genes
 | 
| 
 | 
   119    }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
 | 
| 
 | 
   120       next; #after all genes on chr
 | 
| 
 | 
   121    }else {
 | 
| 
 | 
   122       while ($s[$col1] > $g[2] && $i < $#genes) {
 | 
| 
 | 
   123          $i++;
 | 
| 
 | 
   124          @g = split(/\t/, $genes[$i]);
 | 
| 
 | 
   125          if ($s[$col0] ne $g[0]) { last; } #end of gene
 | 
| 
 | 
   126       }
 | 
| 
 | 
   127       if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
 | 
| 
 | 
   128          next; #no overlap with genes
 | 
| 
 | 
   129       }
 | 
| 
 | 
   130    }
 | 
| 
 | 
   131 
 | 
| 
 | 
   132    processSnp(\@s, \@g);
 | 
| 
 | 
   133    if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
 | 
| 
 | 
   134 
 | 
| 
 | 
   135    my $k = $i + 1; #check for more genes without losing data of first
 | 
| 
 | 
   136    if ($k <= $#genes) {
 | 
| 
 | 
   137       my @g2 = split(/\t/, $genes[$k]);
 | 
| 
 | 
   138       while (@g2 && $k <= $#genes) {
 | 
| 
 | 
   139          @g2 = split(/\t/, $genes[$k]);
 | 
| 
 | 
   140          if ($s[$col0] ne $g2[0]) {
 | 
| 
 | 
   141             undef @g2;
 | 
| 
 | 
   142             last; #not same chrom
 | 
| 
 | 
   143          }else {
 | 
| 
 | 
   144             while ($s[$col1] > $g2[2] && $k <= $#genes) {
 | 
| 
 | 
   145                $k++;
 | 
| 
 | 
   146                @g2 = split(/\t/, $genes[$k]);
 | 
| 
 | 
   147                if ($s[$col0] ne $g2[0]) { last; } #end of chrom
 | 
| 
 | 
   148             }
 | 
| 
 | 
   149             if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
 | 
| 
 | 
   150                undef @g2;
 | 
| 
 | 
   151                last; #no overlap with more genes
 | 
| 
 | 
   152             }
 | 
| 
 | 
   153             processSnp(\@s, \@g2);
 | 
| 
 | 
   154             if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
 | 
| 
 | 
   155          }      
 | 
| 
 | 
   156          $k++;
 | 
| 
 | 
   157       }
 | 
| 
 | 
   158    }
 | 
| 
 | 
   159 }
 | 
| 
 | 
   160 close FH or die "Couldn't close $snpFile, $!\n";
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 exit;
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 ########################################################################
 | 
| 
 | 
   165 sub processSnp {
 | 
| 
 | 
   166    my $sref = shift;
 | 
| 
 | 
   167    my $gref = shift;
 | 
| 
 | 
   168    #overlaps gene, but maybe not coding seq
 | 
| 
 | 
   169    #inside cds
 | 
| 
 | 
   170    if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
 | 
| 
 | 
   171       return; #outside of coding 
 | 
| 
 | 
   172    }
 | 
| 
 | 
   173    #now check exon
 | 
| 
 | 
   174    my $i = 0;
 | 
| 
 | 
   175    my @st = split(/,/, $gref->[11]);
 | 
| 
 | 
   176    my @size = split(/,/, $gref->[10]);
 | 
| 
 | 
   177    if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; }
 | 
| 
 | 
   178    my @pos;
 | 
| 
 | 
   179    my $in = 0;
 | 
| 
 | 
   180    for($i = 0; $i < $gref->[9]; $i++) {
 | 
| 
 | 
   181       my $sta = $gref->[1] + $st[$i] + 1; #1 based position
 | 
| 
 | 
   182       my $end = $sta + $size[$i] - 1; #
 | 
| 
 | 
   183       if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
 | 
| 
 | 
   184       if ($end < $gref->[6]) { next; } #utr only
 | 
| 
 | 
   185       if ($sta > $gref->[7]) { next; } #utr only
 | 
| 
 | 
   186       #shorten to coding only
 | 
| 
 | 
   187       if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
 | 
| 
 | 
   188       if ($end > $gref->[7]) { $end = $gref->[7]; }
 | 
| 
 | 
   189       if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
 | 
| 
 | 
   190       elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
 | 
| 
 | 
   191       push(@pos, ($sta .. $end)); #add exon worth of positions
 | 
| 
 | 
   192    }
 | 
| 
 | 
   193    #@pos has coding positions for whole gene (chr coors), 
 | 
| 
 | 
   194    #and $in has whether we need to continue
 | 
| 
 | 
   195    if (!$in) { return; } #not in coding exon
 | 
| 
 | 
   196    if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
 | 
| 
 | 
   197    if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
 | 
| 
 | 
   198        $sref->[$col3] =~ /^-+$/) { #indel or del
 | 
| 
 | 
   199       my $copy = $sref->[$col3];
 | 
| 
 | 
   200       my $c = ($copy =~ tr/-//);
 | 
| 
 | 
   201       if ($c % 3 == 0) { return; } #not frameshift 
 | 
| 
 | 
   202       #handle bed4 or any interval file
 | 
| 
 | 
   203       if (!$keep) {
 | 
| 
 | 
   204          print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
 | 
| 
 | 
   205          print "\t$gref->[3]\tframeshift\n";
 | 
| 
 | 
   206       }else {
 | 
| 
 | 
   207          my @s = @{$sref};
 | 
| 
 | 
   208          print join("\t", @s), "\t$gref->[3]\tframeshift\n";
 | 
| 
 | 
   209       }
 | 
| 
 | 
   210       $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
 | 
| 
 | 
   211       return;
 | 
| 
 | 
   212    }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
 | 
| 
 | 
   213       my $copy = $sref->[$col3];
 | 
| 
 | 
   214       my $c = ($copy =~ tr/\[ACTG]+//);
 | 
| 
 | 
   215       if ($c % 3 == 0) { return; } #not frameshift
 | 
| 
 | 
   216       #handle bed4 or any interval file
 | 
| 
 | 
   217       if (!$keep) {
 | 
| 
 | 
   218          print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
 | 
| 
 | 
   219          print "\t$gref->[3]\tframeshift\n";
 | 
| 
 | 
   220       }else {
 | 
| 
 | 
   221          my @s = @{$sref};
 | 
| 
 | 
   222          print join("\t", @s), "\t$gref->[3]\tframeshift\n";
 | 
| 
 | 
   223       }
 | 
| 
 | 
   224       $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
 | 
| 
 | 
   225       return;
 | 
| 
 | 
   226    }elsif ($sref->[$col3] =~ /-/) { #indel and sub?
 | 
| 
 | 
   227       return; #skip
 | 
| 
 | 
   228    }
 | 
| 
 | 
   229    #check for amino acid substitutions
 | 
| 
 | 
   230    my $s = $sref->[$col1] + 1;
 | 
| 
 | 
   231    my $e = $sref->[$col2];
 | 
| 
 | 
   232    my $len = $sref->[$col2] - $sref->[$col1];
 | 
| 
 | 
   233    if ($gref->[5] eq '-') { 
 | 
| 
 | 
   234       @pos = reverse(@pos); 
 | 
| 
 | 
   235       my $t = $s;
 | 
| 
 | 
   236       $s = $e;
 | 
| 
 | 
   237       $e = $t;
 | 
| 
 | 
   238    }
 | 
| 
 | 
   239    $i = 0;
 | 
| 
 | 
   240    my $found = 0;
 | 
| 
 | 
   241    foreach (@pos) {
 | 
| 
 | 
   242       if ($s == $_) {
 | 
| 
 | 
   243          $found = 1;
 | 
| 
 | 
   244          last;
 | 
| 
 | 
   245       }
 | 
| 
 | 
   246       $i++;
 | 
| 
 | 
   247    }
 | 
| 
 | 
   248    if ($found) {
 | 
| 
 | 
   249       my $fs = $i; #keep original start index
 | 
| 
 | 
   250       #have index where substitution starts
 | 
| 
 | 
   251       my $cp = $i % 3; 
 | 
| 
 | 
   252       $i -= $cp; #i is now first position in codon
 | 
| 
 | 
   253       my $cdNum = int($i / 3) + 1;
 | 
| 
 | 
   254       my $ls = $i;
 | 
| 
 | 
   255       if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
 | 
| 
 | 
   256       if (!@pos) { die "ERROR not defined array pos\n"; }
 | 
| 
 | 
   257       if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
 | 
| 
 | 
   258       if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
 | 
| 
 | 
   259       while ($ls <= $#pos && $pos[$ls] ne $e) { 
 | 
| 
 | 
   260          $ls++; 
 | 
| 
 | 
   261       }
 | 
| 
 | 
   262       my $i2 = $ls + (2 - ($ls % 3));
 | 
| 
 | 
   263       if ($i2 > $#pos) { return; } #not a full codon, partial gene?
 | 
| 
 | 
   264 
 | 
| 
 | 
   265       if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
 | 
| 
 | 
   266       my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
 | 
| 
 | 
   267       if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
 | 
| 
 | 
   268       my @vars = split(/\//, $sref->[$col3]);
 | 
| 
 | 
   269       if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
 | 
| 
 | 
   270          $oldnts = compl($oldnts);
 | 
| 
 | 
   271          if (!$oldnts) { return; } #skip this one
 | 
| 
 | 
   272          $oldnts = join('', (reverse(split(/ */, $oldnts))));
 | 
| 
 | 
   273          foreach (@vars) {
 | 
| 
 | 
   274             $_ = reverse(split(/ */)); #needed for indels
 | 
| 
 | 
   275             $_ = compl($_);
 | 
| 
 | 
   276          }
 | 
| 
 | 
   277       }
 | 
| 
 | 
   278       my $r = $fs - $i; #difference in old indexes gives new index
 | 
| 
 | 
   279       my @newnts;
 | 
| 
 | 
   280       my $changed = '';
 | 
| 
 | 
   281       foreach my $v (@vars) {
 | 
| 
 | 
   282          if (!$v or length($v) != 1) { return; } #only simple changes
 | 
| 
 | 
   283          my @new = split(/ */, $oldnts);
 | 
| 
 | 
   284          $changed = splice(@new, $r, $len, split(/ */, $v));
 | 
| 
 | 
   285          #should only change single nt
 | 
| 
 | 
   286          push(@newnts, join("", @new));
 | 
| 
 | 
   287       }
 | 
| 
 | 
   288       #now compute amino acids
 | 
| 
 | 
   289       my $oldaa = getaa($oldnts);
 | 
| 
 | 
   290       my @newaa;
 | 
| 
 | 
   291       my $change = 0; #flag for if there is a change
 | 
| 
 | 
   292       foreach my $v (@newnts) {
 | 
| 
 | 
   293          my $t = getaa($v);
 | 
| 
 | 
   294          if ($t ne $oldaa) { $change = 1; }
 | 
| 
 | 
   295          push(@newaa, $t);
 | 
| 
 | 
   296       }
 | 
| 
 | 
   297       if (!$change && $syn) { 
 | 
| 
 | 
   298           if (!$keep) {
 | 
| 
 | 
   299              print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
 | 
| 
 | 
   300              print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
 | 
| 
 | 
   301           }else {
 | 
| 
 | 
   302              my @s = @{$sref};
 | 
| 
 | 
   303              print join("\t", @s), 
 | 
| 
 | 
   304                    "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
 | 
| 
 | 
   305           }
 | 
| 
 | 
   306           return;
 | 
| 
 | 
   307       }elsif ($syn) { return; } #only want synonymous changes
 | 
| 
 | 
   308       if (!$change) { return; } #no change in amino acids
 | 
| 
 | 
   309       if (!$keep) {
 | 
| 
 | 
   310          print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
 | 
| 
 | 
   311          if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
 | 
| 
 | 
   312          if (!$changed) { return; } #skip this one
 | 
| 
 | 
   313          print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
 | 
| 
 | 
   314       }else {
 | 
| 
 | 
   315          my @s = @{$sref};
 | 
| 
 | 
   316          print join("\t", @s);
 | 
| 
 | 
   317          if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
 | 
| 
 | 
   318          if (!$changed) { return; } #skip this one
 | 
| 
 | 
   319          print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
 | 
| 
 | 
   320       }
 | 
| 
 | 
   321       $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
 | 
| 
 | 
   322    }
 | 
| 
 | 
   323 }
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 sub getnts {
 | 
| 
 | 
   326    my $chr = shift;
 | 
| 
 | 
   327    my @pos = @_; #list of positions not necessarily in order
 | 
| 
 | 
   328    #list may be reversed or have gaps(introns), at least 3 bps
 | 
| 
 | 
   329    my $seq = '';
 | 
| 
 | 
   330    if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
 | 
| 
 | 
   331    if ($pos[0] < $pos[1]) { #not reversed
 | 
| 
 | 
   332       my $s = $pos[0];
 | 
| 
 | 
   333       for(my $i = 1; $i <= $#pos; $i++) {
 | 
| 
 | 
   334          if ($pos[$i] == $pos[$i-1] + 1) { next; }
 | 
| 
 | 
   335          if ($seqFlag eq '2bit') { 
 | 
| 
 | 
   336             $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
 | 
| 
 | 
   337          }else {
 | 
| 
 | 
   338             $seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
 | 
| 
 | 
   339          }
 | 
| 
 | 
   340          $s = $pos[$i];
 | 
| 
 | 
   341       }
 | 
| 
 | 
   342       if (length $seq != scalar @pos) { #still need to fetch seq
 | 
| 
 | 
   343          if ($seqFlag eq '2bit') {
 | 
| 
 | 
   344             $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
 | 
| 
 | 
   345          }else {
 | 
| 
 | 
   346             $seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
 | 
| 
 | 
   347          }
 | 
| 
 | 
   348       }
 | 
| 
 | 
   349    }else { #reversed
 | 
| 
 | 
   350       my $s = $pos[$#pos];
 | 
| 
 | 
   351       for(my $i = $#pos -1; $i >= 0; $i--) {
 | 
| 
 | 
   352          if ($pos[$i] == $pos[$i+1] + 1) { next; }
 | 
| 
 | 
   353          if ($seqFlag eq '2bit') {
 | 
| 
 | 
   354             $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
 | 
| 
 | 
   355          }else {
 | 
| 
 | 
   356             $seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
 | 
| 
 | 
   357          }
 | 
| 
 | 
   358          $s = $pos[$i];
 | 
| 
 | 
   359       }
 | 
| 
 | 
   360       if (length $seq != scalar @pos) { #still need to fetch seq
 | 
| 
 | 
   361          if ($seqFlag eq '2bit') {
 | 
| 
 | 
   362             $seq .= fetchSeq2bit($chr, $s, $pos[0]);
 | 
| 
 | 
   363          }else {
 | 
| 
 | 
   364             $seq .= fetchSeqNib($chr, $s, $pos[0]);
 | 
| 
 | 
   365          }
 | 
| 
 | 
   366       }
 | 
| 
 | 
   367    }
 | 
| 
 | 
   368 }
 | 
| 
 | 
   369 
 | 
| 
 | 
   370 sub fetchSeq2bit {
 | 
| 
 | 
   371    my $chr = shift;
 | 
| 
 | 
   372    my $st = shift;
 | 
| 
 | 
   373    my $end = shift;
 | 
| 
 | 
   374    my $strand = '+';
 | 
| 
 | 
   375    $st--; #change to UCSC numbering
 | 
| 
 | 
   376    open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or
 | 
| 
 | 
   377       die "Couldn't run twoBitToFa, $!\n";
 | 
| 
 | 
   378    my $seq = '';
 | 
| 
 | 
   379    while (<BIT>) {
 | 
| 
 | 
   380       chomp;
 | 
| 
 | 
   381       if (/^>/) { next; } #header
 | 
| 
 | 
   382       $seq .= uc($_);
 | 
| 
 | 
   383    }
 | 
| 
 | 
   384    close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
 | 
| 
 | 
   385    return $seq;
 | 
| 
 | 
   386 }
 | 
| 
 | 
   387 
 | 
| 
 | 
   388 sub fetchSeqNib {
 | 
| 
 | 
   389    my $chr = shift;
 | 
| 
 | 
   390    my $st = shift;
 | 
| 
 | 
   391    my $end = shift;
 | 
| 
 | 
   392    my $strand = '+';
 | 
| 
 | 
   393    $st--; #change to UCSC numbering
 | 
| 
 | 
   394    open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
 | 
| 
 | 
   395    my $seq = '';
 | 
| 
 | 
   396    while (<NIB>) {
 | 
| 
 | 
   397       chomp;
 | 
| 
 | 
   398       if (/^>/) { next; } #header
 | 
| 
 | 
   399       $seq .= $_;
 | 
| 
 | 
   400    }
 | 
| 
 | 
   401    close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
 | 
| 
 | 
   402    return $seq;
 | 
| 
 | 
   403 }
 | 
| 
 | 
   404 
 | 
| 
 | 
   405 sub compl {
 | 
| 
 | 
   406    my $nts = shift;
 | 
| 
 | 
   407    my $comp = '';
 | 
| 
 | 
   408    if (!$nts) { die "ERROR called compl with nts undefined"; }
 | 
| 
 | 
   409    foreach my $n (split(/ */, $nts)) {
 | 
| 
 | 
   410       if ($n eq 'A') { $comp .= 'T'; }
 | 
| 
 | 
   411       elsif ($n eq 'T') { $comp .= 'A'; }
 | 
| 
 | 
   412       elsif ($n eq 'C') { $comp .= 'G'; }
 | 
| 
 | 
   413       elsif ($n eq 'G') { $comp .= 'C'; }
 | 
| 
 | 
   414       elsif ($n eq 'N') { $comp .= 'N'; }
 | 
| 
 | 
   415       elsif ($n eq '-') { $comp .= '-'; } #deletion
 | 
| 
 | 
   416       else { $comp = undef; }
 | 
| 
 | 
   417    }
 | 
| 
 | 
   418    return $comp;
 | 
| 
 | 
   419 }
 | 
| 
 | 
   420 
 | 
| 
 | 
   421 sub getaa {
 | 
| 
 | 
   422    my $nts = shift;  #in multiples of 3
 | 
| 
 | 
   423    my $aa = '';
 | 
| 
 | 
   424    my @n = split(/ */, $nts);
 | 
| 
 | 
   425    while (@n) {
 | 
| 
 | 
   426       my @t = splice(@n, 0, 3);
 | 
| 
 | 
   427       my $n = uc(join("", @t));
 | 
| 
 | 
   428       if (!exists $codon{$n}) { $aa .= 'N'; next; }
 | 
| 
 | 
   429       $aa .= $codon{$n};
 | 
| 
 | 
   430    }
 | 
| 
 | 
   431    return $aa;
 | 
| 
 | 
   432 }
 | 
| 
 | 
   433 
 | 
| 
 | 
   434 sub fill_codon {
 | 
| 
 | 
   435 $codon{GCA} = 'Ala';
 | 
| 
 | 
   436 $codon{GCC} = 'Ala';
 | 
| 
 | 
   437 $codon{GCG} = 'Ala';
 | 
| 
 | 
   438 $codon{GCT} = 'Ala';
 | 
| 
 | 
   439 $codon{CGG} = 'Arg';
 | 
| 
 | 
   440 $codon{CGT} = 'Arg';
 | 
| 
 | 
   441 $codon{CGC} = 'Arg';
 | 
| 
 | 
   442 $codon{AGA} = 'Arg';
 | 
| 
 | 
   443 $codon{AGG} = 'Arg';
 | 
| 
 | 
   444 $codon{CGA} = 'Arg';
 | 
| 
 | 
   445 $codon{AAC} = 'Asn';
 | 
| 
 | 
   446 $codon{AAT} = 'Asn';
 | 
| 
 | 
   447 $codon{GAC} = 'Asp';
 | 
| 
 | 
   448 $codon{GAT} = 'Asp';
 | 
| 
 | 
   449 $codon{TGC} = 'Cys';
 | 
| 
 | 
   450 $codon{TGT} = 'Cys';
 | 
| 
 | 
   451 $codon{CAG} = 'Gln';
 | 
| 
 | 
   452 $codon{CAA} = 'Gln';
 | 
| 
 | 
   453 $codon{GAA} = 'Glu';
 | 
| 
 | 
   454 $codon{GAG} = 'Glu';
 | 
| 
 | 
   455 $codon{GGG} = 'Gly';
 | 
| 
 | 
   456 $codon{GGA} = 'Gly';
 | 
| 
 | 
   457 $codon{GGC} = 'Gly';
 | 
| 
 | 
   458 $codon{GGT} = 'Gly';
 | 
| 
 | 
   459 $codon{CAC} = 'His';
 | 
| 
 | 
   460 $codon{CAT} = 'His';
 | 
| 
 | 
   461 $codon{ATA} = 'Ile';
 | 
| 
 | 
   462 $codon{ATT} = 'Ile';
 | 
| 
 | 
   463 $codon{ATC} = 'Ile';
 | 
| 
 | 
   464 $codon{CTA} = 'Leu';
 | 
| 
 | 
   465 $codon{CTC} = 'Leu';
 | 
| 
 | 
   466 $codon{CTG} = 'Leu';
 | 
| 
 | 
   467 $codon{CTT} = 'Leu';
 | 
| 
 | 
   468 $codon{TTG} = 'Leu';
 | 
| 
 | 
   469 $codon{TTA} = 'Leu';
 | 
| 
 | 
   470 $codon{AAA} = 'Lys';
 | 
| 
 | 
   471 $codon{AAG} = 'Lys';
 | 
| 
 | 
   472 $codon{ATG} = 'Met';
 | 
| 
 | 
   473 $codon{TTC} = 'Phe';
 | 
| 
 | 
   474 $codon{TTT} = 'Phe';
 | 
| 
 | 
   475 $codon{CCT} = 'Pro';
 | 
| 
 | 
   476 $codon{CCA} = 'Pro';
 | 
| 
 | 
   477 $codon{CCC} = 'Pro';
 | 
| 
 | 
   478 $codon{CCG} = 'Pro';
 | 
| 
 | 
   479 $codon{TCA} = 'Ser';
 | 
| 
 | 
   480 $codon{AGC} = 'Ser';
 | 
| 
 | 
   481 $codon{AGT} = 'Ser';
 | 
| 
 | 
   482 $codon{TCC} = 'Ser';
 | 
| 
 | 
   483 $codon{TCT} = 'Ser';
 | 
| 
 | 
   484 $codon{TCG} = 'Ser';
 | 
| 
 | 
   485 $codon{TGA} = 'Stop';
 | 
| 
 | 
   486 $codon{TAG} = 'Stop';
 | 
| 
 | 
   487 $codon{TAA} = 'Stop';
 | 
| 
 | 
   488 $codon{ACT} = 'Thr';
 | 
| 
 | 
   489 $codon{ACA} = 'Thr';
 | 
| 
 | 
   490 $codon{ACC} = 'Thr';
 | 
| 
 | 
   491 $codon{ACG} = 'Thr';
 | 
| 
 | 
   492 $codon{TGG} = 'Trp';
 | 
| 
 | 
   493 $codon{TAT} = 'Tyr';
 | 
| 
 | 
   494 $codon{TAC} = 'Tyr';
 | 
| 
 | 
   495 $codon{GTC} = 'Val';
 | 
| 
 | 
   496 $codon{GTA} = 'Val';
 | 
| 
 | 
   497 $codon{GTG} = 'Val';
 | 
| 
 | 
   498 $codon{GTT} = 'Val';
 | 
| 
 | 
   499 }
 | 
| 
 | 
   500 
 | 
| 
 | 
   501 sub getGalaxyInfo {
 | 
| 
 | 
   502    my $build;
 | 
| 
 | 
   503    my $locFile;
 | 
| 
 | 
   504    foreach (@ARGV) {
 | 
| 
 | 
   505       if (/build=(.*)/) { $build = $1; }
 | 
| 
 | 
   506       elsif (/loc=(.*)/) { $locFile = $1; }
 | 
| 
 | 
   507    }
 | 
| 
 | 
   508    if (!$build or !$locFile) {
 | 
| 
 | 
   509       print STDERR "ERROR missing build or locfile for Galaxy input\n";
 | 
| 
 | 
   510       exit 1;
 | 
| 
 | 
   511    }
 | 
| 
 | 
   512    # read $locFile to get $nibDir (ignoring commets)
 | 
| 
 | 
   513    open(LF, "< $locFile") || die "open($locFile): $!\n";
 | 
| 
 | 
   514    while(<LF>) {
 | 
| 
 | 
   515       s/#.*$//;
 | 
| 
 | 
   516       s/(?:^\s+|\s+$)//g;
 | 
| 
 | 
   517       next if (/^$/);
 | 
| 
 | 
   518    
 | 
| 
 | 
   519       my @t = split(/\t/);
 | 
| 
 | 
   520       if ($t[0] eq $build) { $nibDir = $t[1]; }
 | 
| 
 | 
   521    }
 | 
| 
 | 
   522    close(LF);
 | 
| 
 | 
   523    if ($nibDir eq 'Galaxy') {
 | 
| 
 | 
   524       print STDERR "Failed to find sequence directory in locfile $locFile\n";
 | 
| 
 | 
   525    }
 | 
| 
 | 
   526    $nibDir .= "/$build.2bit";  #we want full path and filename
 | 
| 
 | 
   527 }
 | 
| 
 | 
   528 
 |