0
|
1 package CPT::Bio::ORF;
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use autodie;
|
|
5 use Moose;
|
|
6
|
|
7 has min_gene_length => (
|
|
8 is => 'rw',
|
|
9 isa => 'Int',
|
|
10 default => sub {
|
|
11 0
|
|
12 },
|
|
13 );
|
|
14 has sc_atg => ( is => 'rw', isa => 'Bool', default => sub { 1 } );
|
|
15 has sc_ttg => ( is => 'rw', isa => 'Bool', default => sub { 1 } );
|
|
16 has sc_ctg => ( is => 'rw', isa => 'Bool', default => sub { 0 } );
|
|
17 has sc_gtg => ( is => 'rw', isa => 'Bool', default => sub { 1 } );
|
|
18
|
|
19 our %code = (
|
|
20 "TTT" => "F", "TTC" => "F", "TTA" => "L", "TTG" => "L", "TCT" => "S",
|
|
21 "TCC" => "S", "TCA" => "S", "TCG" => "S", "TAT" => "Y", "TAC" => "Y",
|
|
22 "TAA" => "*", "TAG" => "*", "TGT" => "C", "TGC" => "C", "TGA" => "*",
|
|
23 "TGG" => "W", "CTT" => "L", "CTC" => "L", "CTA" => "L", "CTG" => "L",
|
|
24 "CCT" => "P", "CCC" => "P", "CCA" => "P", "CCG" => "P", "CAT" => "H",
|
|
25 "CAC" => "H", "CAA" => "Q", "CAG" => "Q", "CGT" => "R", "CGC" => "R",
|
|
26 "CGA" => "R", "CGG" => "R", "ATT" => "I", "ATC" => "I", "ATA" => "I",
|
|
27 "ATG" => "M", "ACT" => "T", "ACC" => "T", "ACA" => "T", "ACG" => "T",
|
|
28 "AAT" => "N", "AAC" => "N", "AAA" => "K", "AAG" => "K", "AGT" => "S",
|
|
29 "AGC" => "S", "AGA" => "R", "AGG" => "R", "GTT" => "V", "GTC" => "V",
|
|
30 "GTA" => "V", "GTG" => "V", "GCT" => "A", "GCC" => "A", "GCA" => "A",
|
|
31 "GCG" => "A", "GAT" => "D", "GAC" => "D", "GAA" => "E", "GAG" => "E",
|
|
32 "GGT" => "G", "GGC" => "G", "GGA" => "G", "GGG" => "G",
|
|
33 );
|
|
34
|
|
35
|
|
36
|
|
37 sub run {
|
|
38 my ($self, $sequence) = @_;
|
|
39 # Read through forward strand
|
|
40 my @putative_starts;
|
|
41
|
|
42 # 30 seconds with a bioperl object
|
|
43 # 5 seconds with string munging. >:|
|
|
44 my $dna = uc( $sequence );
|
|
45 my $length = length($sequence);
|
|
46
|
|
47 # Pre-create the regular expressions
|
|
48 my ( $regex_forward, $regex_backwards );
|
|
49 my $not_statement_f = '^';
|
|
50 my $not_statement_r = '^';
|
|
51 if ( !$self->sc_atg() ) {
|
|
52 $not_statement_f .= 'A';
|
|
53 $not_statement_r .= 'T';
|
|
54 }
|
|
55 if ( !$self->sc_ctg() ) {
|
|
56 $not_statement_f .= 'C';
|
|
57 $not_statement_r .= 'G';
|
|
58 }
|
|
59 if ( !$self->sc_ttg() ) {
|
|
60 $not_statement_f .= 'T';
|
|
61 $not_statement_r .= 'A';
|
|
62 }
|
|
63 if ( !$self->sc_gtg() ) {
|
|
64 $not_statement_f .= 'G';
|
|
65 $not_statement_r .= 'C';
|
|
66 }
|
|
67
|
|
68 # If any start is acceptable, we re-add them and remove our ^
|
|
69 if($not_statement_r eq '^' && $not_statement_f eq '^'){
|
|
70 $not_statement_f = 'ACTG';
|
|
71 $not_statement_r = 'ACTG';
|
|
72 }
|
|
73 $regex_forward = qr/[${not_statement_f}]TG/;
|
|
74 $regex_backwards = qr/CA[${not_statement_r}]/;
|
|
75
|
|
76 # Collect putative starts
|
|
77 for ( my $i = 1 ; $i < $length - 1 ; $i++ ) {
|
|
78 my $tri_nt = substr( $dna, $i - 1, 3 ); #$seq_obj->subseq($i,$i+2);
|
|
79 if ( $tri_nt =~ $regex_forward ) {
|
|
80 push( @putative_starts, [ $i, '+' ] );
|
|
81 }
|
|
82 if ( $tri_nt =~ $regex_backwards ) {
|
|
83 push( @putative_starts, [ $i + 2, '-' ] );
|
|
84 }
|
|
85 }
|
|
86 my %ORFs;
|
|
87
|
|
88 #Loop through all of the starts we have
|
|
89 my $fc = 0;
|
|
90 my $rc = 0;
|
|
91 foreach (@putative_starts) {
|
|
92 my @putative_start = @{$_};
|
|
93
|
|
94 my $final_seq = "";
|
|
95
|
|
96 my $add;
|
|
97 my $tri_nt;
|
|
98 if ( $putative_start[1] eq "+" ) {
|
|
99 my $end;
|
|
100 for ( my $k = $putative_start[0] ; $k < $length ; $k = $k + 3 )
|
|
101 {
|
|
102 my $tri_nt = substr( $dna, $k, 3 );
|
|
103 my $aa = $code{$tri_nt};
|
|
104 if ( $aa && $aa ne '*' ) {
|
|
105 $end = $k + 3;
|
|
106 $final_seq .= $tri_nt;
|
|
107 }
|
|
108 else {
|
|
109 last;
|
|
110 }
|
|
111 }
|
|
112 if ( length($final_seq)/3 > $self->min_gene_length() ) {
|
|
113 $ORFs{ 'f_' + $fc++ } = [
|
|
114 length($final_seq)/3,
|
|
115 $putative_start[0],
|
|
116 $end,
|
|
117 'F',
|
|
118 $final_seq
|
|
119 ];
|
|
120 }
|
|
121 } # - strand
|
|
122 else {
|
|
123 my $end;
|
|
124 for ( my $k = $putative_start[0] ; $k >= 2 ; $k = $k - 3 ) {
|
|
125 my $tmp = reverse( substr( $dna, $k - 3, 3 ) );
|
|
126
|
|
127 $tmp =~ tr/ACTG/qzAC/;
|
|
128 $tmp =~ tr/qz/TG/;
|
|
129
|
|
130 my $aa = $code{$tmp};
|
|
131 if ( defined $aa && $aa ne '*' ) {
|
|
132 $end = $k - 1;
|
|
133 $final_seq .= $tmp;
|
|
134 }
|
|
135 else {
|
|
136 last;
|
|
137 }
|
|
138
|
|
139 }
|
|
140 if ( length($final_seq)/3 > $self->min_gene_length() ) {
|
|
141 $ORFs{ 'r_' + $rc++ } = [
|
|
142 length($final_seq)/3,
|
|
143 $end,
|
|
144 $putative_start[0],
|
|
145 'R',
|
|
146 $final_seq
|
|
147 ];
|
|
148 }
|
|
149 }
|
|
150 }
|
|
151
|
|
152 my @orfs;
|
|
153
|
|
154 for my $orf_key ( sort( keys(%ORFs) ) ) {
|
|
155 my @tmp= @{ $ORFs{$orf_key} };
|
|
156 my $seqobj = Bio::Seq->new(
|
|
157 -display_id => sprintf(
|
|
158 'orf%05d_%s',
|
|
159 ($orf_key + 1), $tmp[3],
|
|
160 ),
|
|
161 -desc => sprintf(
|
|
162 '[%s-%s; %s aa long]'
|
|
163 ,$tmp[1], $tmp[2], $tmp[0]
|
|
164 ),
|
|
165 -seq => $tmp[4]
|
|
166 );
|
|
167 push(@orfs, $seqobj);
|
|
168 }
|
|
169 return @orfs;
|
|
170 }
|
|
171
|
|
172
|
|
173
|
|
174 no Moose;
|
|
175 1;
|
|
176
|
|
177 __END__
|
|
178
|
|
179 =pod
|
|
180
|
|
181 =encoding UTF-8
|
|
182
|
|
183 =head1 NAME
|
|
184
|
|
185 CPT::Bio::ORF
|
|
186
|
|
187 =head1 VERSION
|
|
188
|
|
189 version 1.99.4
|
|
190
|
|
191 =function run
|
|
192
|
|
193 =head1 AUTHOR
|
|
194
|
|
195 Eric Rasche <rasche.eric@yandex.ru>
|
|
196
|
|
197 =head1 COPYRIGHT AND LICENSE
|
|
198
|
|
199 This software is Copyright (c) 2014 by Eric Rasche.
|
|
200
|
|
201 This is free software, licensed under:
|
|
202
|
|
203 The GNU General Public License, Version 3, June 2007
|
|
204
|
|
205 =cut
|