annotate tools/evolution/codingSnps.pl @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/perl -w
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 use strict;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 #########################################################################
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # codingSnps.pl
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 # This takes a bed file with the names being / separated nts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 # and a gene bed file with cds start and stop.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # It then checks for changes in coding regions, reporting
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 # those that cause a frameshift or substitution in the amino acid.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 #########################################################################
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 if (!@ARGV or scalar @ARGV < 3) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 exit;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 my $uniq = 0; #flag for whether want uniq positions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 my $syn = 0; #flag for if want synonomous changes rather than non-syn
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 my $keep = 0; #keep old columns and append new ones
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 my $snpFile = shift @ARGV;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 my $geneFile = shift @ARGV;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 my $nibDir = shift @ARGV; #2bit or nib, depending on flag above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 my $col0 = 0; #bed like columns in default positions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 my $col1 = 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 my $col2 = 2;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 my $col3 = 3;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 #column positions 1 based coming in (for Galaxy)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 foreach (@ARGV) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 if (/chr=(\d+)/) { $col0 = $1 -1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 elsif (/start=(\d+)/) { $col1 = $1 -1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 elsif (/end=(\d+)/) { $col2 = $1 -1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 elsif (/snp=(\d+)/) { $col3 = $1 -1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 elsif (/keepColumns=1/) { $keep = 1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 print STDERR "ERROR column numbers are given with origin 1\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 my @genes; #bed lines for genes, sorted by chrom and start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 my %chrSt; #index in array where each chrom starts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 my %codon; #hash of codon amino acid conversions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 my $ignoreN = 1; #skip N
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 my %amb = (
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 "R" => "A/G",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 "Y" => "C/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 "S" => "C/G",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 "W" => "A/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 "K" => "G/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 "M" => "A/C",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 "B" => "C/G/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 "D" => "A/G/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 "H" => "A/C/T",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 "V" => "A/C/G",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 "N" => "A/C/G/T"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 );
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 fill_codon();
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 or die "Couldn't open and sort $geneFile, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 my $i = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 while(<FH>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 chomp;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 push(@genes, "$_");
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 my @f = split(/\t/);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 $i++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 close FH or die "Couldn't close $geneFile, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 #open snps sorted as well
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 my $s1 = $col0 + 1; #sort order is origin 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 my $s2 = $col1 + 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 or die "Couldn't open and sort $snpFile, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 $i = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 my @g; #one genes fields, should be used repeatedly
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 my %done;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 while(<FH>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 chomp;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 if (/^\s*#/) { next; } #comment
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 my @s = split(/\t/); #SNP fields
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 my $size = $#s;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 if ($s[$col1] =~ /\D/) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 if ($s[$col2] =~ /\D/) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 if ($s[$col3] eq 'N' && $ignoreN) { next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 $i = $chrSt{$s[$col0]};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 @g = split(/\t/, $genes[$i]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 if (scalar @g < 12) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 print STDERR "ERROR the gene file must be the whole genes in BED format\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 }elsif (!@g) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 next; #no gene for this chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 $i = $chrSt{$s[$col0]};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 @g = split(/\t/, $genes[$i]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 }elsif ($s[$col0] ne $g[0]) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 next; #no gene for this chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 next; #before any genes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 next; #after all genes on chr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 while ($s[$col1] > $g[2] && $i < $#genes) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 $i++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 @g = split(/\t/, $genes[$i]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 if ($s[$col0] ne $g[0]) { last; } #end of gene
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 next; #no overlap with genes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 processSnp(\@s, \@g);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 my $k = $i + 1; #check for more genes without losing data of first
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 if ($k <= $#genes) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 my @g2 = split(/\t/, $genes[$k]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 while (@g2 && $k <= $#genes) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 @g2 = split(/\t/, $genes[$k]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 if ($s[$col0] ne $g2[0]) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 undef @g2;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 last; #not same chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 while ($s[$col1] > $g2[2] && $k <= $#genes) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 $k++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 @g2 = split(/\t/, $genes[$k]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 if ($s[$col0] ne $g2[0]) { last; } #end of chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 undef @g2;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 last; #no overlap with more genes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 processSnp(\@s, \@g2);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 $k++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 close FH or die "Couldn't close $snpFile, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 exit;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 ########################################################################
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 sub processSnp {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 my $sref = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 my $gref = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 #overlaps gene, but maybe not coding seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 #inside cds
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 return; #outside of coding
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 #now check exon
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 my $i = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 my @st = split(/,/, $gref->[11]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 my @size = split(/,/, $gref->[10]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 my @pos;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179 my $in = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 for($i = 0; $i < $gref->[9]; $i++) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 my $sta = $gref->[1] + $st[$i] + 1; #1 based position
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 my $end = $sta + $size[$i] - 1; #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 if ($end < $gref->[6]) { next; } #utr only
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 if ($sta > $gref->[7]) { next; } #utr only
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 #shorten to coding only
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 if ($end > $gref->[7]) { $end = $gref->[7]; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 push(@pos, ($sta .. $end)); #add exon worth of positions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 #@pos has coding positions for whole gene (chr coors),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194 #and $in has whether we need to continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 if (!$in) { return; } #not in coding exon
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196 if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198 $sref->[$col3] =~ /^-+$/) { #indel or del
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199 my $copy = $sref->[$col3];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200 my $c = ($copy =~ tr/-//);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201 if ($c % 3 == 0) { return; } #not frameshift
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202 #handle bed4 or any interval file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203 if (!$keep) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205 print "\t$gref->[3]\tframeshift\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207 my @s = @{$sref};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208 print join("\t", @s), "\t$gref->[3]\tframeshift\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 return;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212 }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 my $copy = $sref->[$col3];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214 my $c = ($copy =~ tr/\[ACTG]+//);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 if ($c % 3 == 0) { return; } #not frameshift
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216 #handle bed4 or any interval file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217 if (!$keep) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219 print "\t$gref->[3]\tframeshift\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 my @s = @{$sref};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222 print join("\t", @s), "\t$gref->[3]\tframeshift\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225 return;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226 }elsif ($sref->[$col3] =~ /-/) { #indel and sub?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 return; #skip
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229 #check for amino acid substitutions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230 my $s = $sref->[$col1] + 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
231 my $e = $sref->[$col2];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
232 my $len = $sref->[$col2] - $sref->[$col1];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
233 if ($gref->[5] eq '-') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
234 @pos = reverse(@pos);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
235 my $t = $s;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
236 $s = $e;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
237 $e = $t;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
238 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
239 $i = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
240 my $found = 0;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
241 foreach (@pos) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
242 if ($s == $_) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
243 $found = 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
244 last;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
245 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
246 $i++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
247 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
248 if ($found) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
249 my $fs = $i; #keep original start index
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
250 #have index where substitution starts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
251 my $cp = $i % 3;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
252 $i -= $cp; #i is now first position in codon
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
253 my $cdNum = int($i / 3) + 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
254 my $ls = $i;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
255 if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
256 if (!@pos) { die "ERROR not defined array pos\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
257 if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
258 if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
259 while ($ls <= $#pos && $pos[$ls] ne $e) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
260 $ls++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
261 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
262 my $i2 = $ls + (2 - ($ls % 3));
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
263 if ($i2 > $#pos) { return; } #not a full codon, partial gene?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
264
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
265 if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
266 my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
267 if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
268 my @vars = split(/\//, $sref->[$col3]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
269 if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
270 $oldnts = compl($oldnts);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
271 if (!$oldnts) { return; } #skip this one
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
272 $oldnts = join('', (reverse(split(/ */, $oldnts))));
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
273 foreach (@vars) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
274 $_ = reverse(split(/ */)); #needed for indels
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
275 $_ = compl($_);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
276 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
277 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
278 my $r = $fs - $i; #difference in old indexes gives new index
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
279 my @newnts;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
280 my $changed = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
281 foreach my $v (@vars) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
282 if (!$v or length($v) != 1) { return; } #only simple changes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
283 my @new = split(/ */, $oldnts);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
284 $changed = splice(@new, $r, $len, split(/ */, $v));
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
285 #should only change single nt
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
286 push(@newnts, join("", @new));
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
287 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
288 #now compute amino acids
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
289 my $oldaa = getaa($oldnts);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
290 my @newaa;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
291 my $change = 0; #flag for if there is a change
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
292 foreach my $v (@newnts) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
293 my $t = getaa($v);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
294 if ($t ne $oldaa) { $change = 1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
295 push(@newaa, $t);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
296 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
297 if (!$change && $syn) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
298 if (!$keep) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
299 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
300 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
301 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
302 my @s = @{$sref};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
303 print join("\t", @s),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
304 "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
305 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
306 return;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
307 }elsif ($syn) { return; } #only want synonymous changes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
308 if (!$change) { return; } #no change in amino acids
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
309 if (!$keep) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
310 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
311 if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
312 if (!$changed) { return; } #skip this one
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
313 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
314 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
315 my @s = @{$sref};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
316 print join("\t", @s);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
317 if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
318 if (!$changed) { return; } #skip this one
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
319 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
320 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
321 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
322 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
323 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
324
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
325 sub getnts {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
326 my $chr = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
327 my @pos = @_; #list of positions not necessarily in order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
328 #list may be reversed or have gaps(introns), at least 3 bps
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
329 my $seq = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
330 if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
331 if ($pos[0] < $pos[1]) { #not reversed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
332 my $s = $pos[0];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
333 for(my $i = 1; $i <= $#pos; $i++) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
334 if ($pos[$i] == $pos[$i-1] + 1) { next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
335 if ($seqFlag eq '2bit') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
336 $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
337 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
338 $seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
339 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
340 $s = $pos[$i];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
341 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
342 if (length $seq != scalar @pos) { #still need to fetch seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
343 if ($seqFlag eq '2bit') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
344 $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
345 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
346 $seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
347 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
348 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
349 }else { #reversed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
350 my $s = $pos[$#pos];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
351 for(my $i = $#pos -1; $i >= 0; $i--) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
352 if ($pos[$i] == $pos[$i+1] + 1) { next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
353 if ($seqFlag eq '2bit') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
354 $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
355 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
356 $seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
357 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
358 $s = $pos[$i];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
359 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
360 if (length $seq != scalar @pos) { #still need to fetch seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
361 if ($seqFlag eq '2bit') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
362 $seq .= fetchSeq2bit($chr, $s, $pos[0]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
363 }else {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
364 $seq .= fetchSeqNib($chr, $s, $pos[0]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
365 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
366 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
367 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
368 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
369
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
370 sub fetchSeq2bit {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
371 my $chr = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
372 my $st = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
373 my $end = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
374 my $strand = '+';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
375 $st--; #change to UCSC numbering
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
376 open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
377 die "Couldn't run twoBitToFa, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
378 my $seq = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
379 while (<BIT>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
380 chomp;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
381 if (/^>/) { next; } #header
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
382 $seq .= uc($_);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
383 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
384 close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
385 return $seq;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
386 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
387
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
388 sub fetchSeqNib {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
389 my $chr = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
390 my $st = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
391 my $end = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
392 my $strand = '+';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
393 $st--; #change to UCSC numbering
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
394 open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
395 my $seq = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
396 while (<NIB>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
397 chomp;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
398 if (/^>/) { next; } #header
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
399 $seq .= $_;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
400 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
401 close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
402 return $seq;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
403 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
404
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
405 sub compl {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
406 my $nts = shift;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
407 my $comp = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
408 if (!$nts) { die "ERROR called compl with nts undefined"; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
409 foreach my $n (split(/ */, $nts)) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
410 if ($n eq 'A') { $comp .= 'T'; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
411 elsif ($n eq 'T') { $comp .= 'A'; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
412 elsif ($n eq 'C') { $comp .= 'G'; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
413 elsif ($n eq 'G') { $comp .= 'C'; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
414 elsif ($n eq 'N') { $comp .= 'N'; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
415 elsif ($n eq '-') { $comp .= '-'; } #deletion
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
416 else { $comp = undef; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
417 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
418 return $comp;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
419 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
420
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
421 sub getaa {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
422 my $nts = shift; #in multiples of 3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
423 my $aa = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
424 my @n = split(/ */, $nts);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
425 while (@n) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
426 my @t = splice(@n, 0, 3);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
427 my $n = uc(join("", @t));
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
428 if (!exists $codon{$n}) { $aa .= 'N'; next; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
429 $aa .= $codon{$n};
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
430 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
431 return $aa;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
432 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
433
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
434 sub fill_codon {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
435 $codon{GCA} = 'Ala';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
436 $codon{GCC} = 'Ala';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
437 $codon{GCG} = 'Ala';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
438 $codon{GCT} = 'Ala';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
439 $codon{CGG} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
440 $codon{CGT} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
441 $codon{CGC} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
442 $codon{AGA} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
443 $codon{AGG} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
444 $codon{CGA} = 'Arg';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
445 $codon{AAC} = 'Asn';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
446 $codon{AAT} = 'Asn';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
447 $codon{GAC} = 'Asp';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
448 $codon{GAT} = 'Asp';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
449 $codon{TGC} = 'Cys';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
450 $codon{TGT} = 'Cys';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
451 $codon{CAG} = 'Gln';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
452 $codon{CAA} = 'Gln';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
453 $codon{GAA} = 'Glu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
454 $codon{GAG} = 'Glu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
455 $codon{GGG} = 'Gly';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
456 $codon{GGA} = 'Gly';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
457 $codon{GGC} = 'Gly';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
458 $codon{GGT} = 'Gly';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
459 $codon{CAC} = 'His';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
460 $codon{CAT} = 'His';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
461 $codon{ATA} = 'Ile';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
462 $codon{ATT} = 'Ile';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
463 $codon{ATC} = 'Ile';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
464 $codon{CTA} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
465 $codon{CTC} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
466 $codon{CTG} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
467 $codon{CTT} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
468 $codon{TTG} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
469 $codon{TTA} = 'Leu';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
470 $codon{AAA} = 'Lys';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
471 $codon{AAG} = 'Lys';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
472 $codon{ATG} = 'Met';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
473 $codon{TTC} = 'Phe';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
474 $codon{TTT} = 'Phe';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
475 $codon{CCT} = 'Pro';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
476 $codon{CCA} = 'Pro';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
477 $codon{CCC} = 'Pro';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
478 $codon{CCG} = 'Pro';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
479 $codon{TCA} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
480 $codon{AGC} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
481 $codon{AGT} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
482 $codon{TCC} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
483 $codon{TCT} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
484 $codon{TCG} = 'Ser';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
485 $codon{TGA} = 'Stop';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
486 $codon{TAG} = 'Stop';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
487 $codon{TAA} = 'Stop';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
488 $codon{ACT} = 'Thr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
489 $codon{ACA} = 'Thr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
490 $codon{ACC} = 'Thr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
491 $codon{ACG} = 'Thr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
492 $codon{TGG} = 'Trp';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
493 $codon{TAT} = 'Tyr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
494 $codon{TAC} = 'Tyr';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
495 $codon{GTC} = 'Val';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
496 $codon{GTA} = 'Val';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
497 $codon{GTG} = 'Val';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
498 $codon{GTT} = 'Val';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
499 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
500
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
501 sub getGalaxyInfo {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
502 my $build;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
503 my $locFile;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
504 foreach (@ARGV) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
505 if (/build=(.*)/) { $build = $1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
506 elsif (/loc=(.*)/) { $locFile = $1; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
507 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
508 if (!$build or !$locFile) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
509 print STDERR "ERROR missing build or locfile for Galaxy input\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
510 exit 1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
511 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
512 # read $locFile to get $nibDir (ignoring commets)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
513 open(LF, "< $locFile") || die "open($locFile): $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
514 while(<LF>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
515 s/#.*$//;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
516 s/(?:^\s+|\s+$)//g;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
517 next if (/^$/);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
518
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
519 my @t = split(/\t/);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
520 if ($t[0] eq $build) { $nibDir = $t[1]; }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
521 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
522 close(LF);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
523 if ($nibDir eq 'Galaxy') {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
524 print STDERR "Failed to find sequence directory in locfile $locFile\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
525 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
526 $nibDir .= "/$build.2bit"; #we want full path and filename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
527 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
528