Mercurial > repos > cpt > cpt_psm_prep
comparison lib/CPT/Bio/ORF.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e4de0a0e90c8 | 1:d724f34e671d |
---|---|
1 package CPT::Bio::ORF; | |
2 use strict; | |
3 use warnings; | |
4 use autodie; | |
5 use Moose; | |
6 | |
7 has min_gene_length => ( | |
8 is => 'rw', | |
9 isa => 'Int', | |
10 default => sub { | |
11 0 | |
12 }, | |
13 ); | |
14 has sc_atg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
15 has sc_ttg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
16 has sc_ctg => ( is => 'rw', isa => 'Bool', default => sub { 0 } ); | |
17 has sc_gtg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
18 | |
19 our %code = ( | |
20 "TTT" => "F", "TTC" => "F", "TTA" => "L", "TTG" => "L", "TCT" => "S", | |
21 "TCC" => "S", "TCA" => "S", "TCG" => "S", "TAT" => "Y", "TAC" => "Y", | |
22 "TAA" => "*", "TAG" => "*", "TGT" => "C", "TGC" => "C", "TGA" => "*", | |
23 "TGG" => "W", "CTT" => "L", "CTC" => "L", "CTA" => "L", "CTG" => "L", | |
24 "CCT" => "P", "CCC" => "P", "CCA" => "P", "CCG" => "P", "CAT" => "H", | |
25 "CAC" => "H", "CAA" => "Q", "CAG" => "Q", "CGT" => "R", "CGC" => "R", | |
26 "CGA" => "R", "CGG" => "R", "ATT" => "I", "ATC" => "I", "ATA" => "I", | |
27 "ATG" => "M", "ACT" => "T", "ACC" => "T", "ACA" => "T", "ACG" => "T", | |
28 "AAT" => "N", "AAC" => "N", "AAA" => "K", "AAG" => "K", "AGT" => "S", | |
29 "AGC" => "S", "AGA" => "R", "AGG" => "R", "GTT" => "V", "GTC" => "V", | |
30 "GTA" => "V", "GTG" => "V", "GCT" => "A", "GCC" => "A", "GCA" => "A", | |
31 "GCG" => "A", "GAT" => "D", "GAC" => "D", "GAA" => "E", "GAG" => "E", | |
32 "GGT" => "G", "GGC" => "G", "GGA" => "G", "GGG" => "G", | |
33 ); | |
34 | |
35 | |
36 | |
37 sub run { | |
38 my ($self, $sequence) = @_; | |
39 # Read through forward strand | |
40 my @putative_starts; | |
41 | |
42 # 30 seconds with a bioperl object | |
43 # 5 seconds with string munging. >:| | |
44 my $dna = uc( $sequence ); | |
45 my $length = length($sequence); | |
46 | |
47 # Pre-create the regular expressions | |
48 my ( $regex_forward, $regex_backwards ); | |
49 my $not_statement_f = '^'; | |
50 my $not_statement_r = '^'; | |
51 if ( !$self->sc_atg() ) { | |
52 $not_statement_f .= 'A'; | |
53 $not_statement_r .= 'T'; | |
54 } | |
55 if ( !$self->sc_ctg() ) { | |
56 $not_statement_f .= 'C'; | |
57 $not_statement_r .= 'G'; | |
58 } | |
59 if ( !$self->sc_ttg() ) { | |
60 $not_statement_f .= 'T'; | |
61 $not_statement_r .= 'A'; | |
62 } | |
63 if ( !$self->sc_gtg() ) { | |
64 $not_statement_f .= 'G'; | |
65 $not_statement_r .= 'C'; | |
66 } | |
67 | |
68 # If any start is acceptable, we re-add them and remove our ^ | |
69 if($not_statement_r eq '^' && $not_statement_f eq '^'){ | |
70 $not_statement_f = 'ACTG'; | |
71 $not_statement_r = 'ACTG'; | |
72 } | |
73 $regex_forward = qr/[${not_statement_f}]TG/; | |
74 $regex_backwards = qr/CA[${not_statement_r}]/; | |
75 | |
76 # Collect putative starts | |
77 for ( my $i = 1 ; $i < $length - 1 ; $i++ ) { | |
78 my $tri_nt = substr( $dna, $i - 1, 3 ); #$seq_obj->subseq($i,$i+2); | |
79 if ( $tri_nt =~ $regex_forward ) { | |
80 push( @putative_starts, [ $i, '+' ] ); | |
81 } | |
82 if ( $tri_nt =~ $regex_backwards ) { | |
83 push( @putative_starts, [ $i + 2, '-' ] ); | |
84 } | |
85 } | |
86 my %ORFs; | |
87 | |
88 #Loop through all of the starts we have | |
89 my $fc = 0; | |
90 my $rc = 0; | |
91 foreach (@putative_starts) { | |
92 my @putative_start = @{$_}; | |
93 | |
94 my $final_seq = ""; | |
95 | |
96 my $add; | |
97 my $tri_nt; | |
98 if ( $putative_start[1] eq "+" ) { | |
99 my $end; | |
100 for ( my $k = $putative_start[0] ; $k < $length ; $k = $k + 3 ) | |
101 { | |
102 my $tri_nt = substr( $dna, $k, 3 ); | |
103 my $aa = $code{$tri_nt}; | |
104 if ( $aa && $aa ne '*' ) { | |
105 $end = $k + 3; | |
106 $final_seq .= $tri_nt; | |
107 } | |
108 else { | |
109 last; | |
110 } | |
111 } | |
112 if ( length($final_seq)/3 > $self->min_gene_length() ) { | |
113 $ORFs{ 'f_' + $fc++ } = [ | |
114 length($final_seq)/3, | |
115 $putative_start[0], | |
116 $end, | |
117 'F', | |
118 $final_seq | |
119 ]; | |
120 } | |
121 } # - strand | |
122 else { | |
123 my $end; | |
124 for ( my $k = $putative_start[0] ; $k >= 2 ; $k = $k - 3 ) { | |
125 my $tmp = reverse( substr( $dna, $k - 3, 3 ) ); | |
126 | |
127 $tmp =~ tr/ACTG/qzAC/; | |
128 $tmp =~ tr/qz/TG/; | |
129 | |
130 my $aa = $code{$tmp}; | |
131 if ( defined $aa && $aa ne '*' ) { | |
132 $end = $k - 1; | |
133 $final_seq .= $tmp; | |
134 } | |
135 else { | |
136 last; | |
137 } | |
138 | |
139 } | |
140 if ( length($final_seq)/3 > $self->min_gene_length() ) { | |
141 $ORFs{ 'r_' + $rc++ } = [ | |
142 length($final_seq)/3, | |
143 $end, | |
144 $putative_start[0], | |
145 'R', | |
146 $final_seq | |
147 ]; | |
148 } | |
149 } | |
150 } | |
151 | |
152 my @orfs; | |
153 | |
154 for my $orf_key ( sort( keys(%ORFs) ) ) { | |
155 my @tmp= @{ $ORFs{$orf_key} }; | |
156 my $seqobj = Bio::Seq->new( | |
157 -display_id => sprintf( | |
158 'orf%05d_%s', | |
159 ($orf_key + 1), $tmp[3], | |
160 ), | |
161 -desc => sprintf( | |
162 '[%s-%s; %s aa long]' | |
163 ,$tmp[1], $tmp[2], $tmp[0] | |
164 ), | |
165 -seq => $tmp[4] | |
166 ); | |
167 push(@orfs, $seqobj); | |
168 } | |
169 return @orfs; | |
170 } | |
171 | |
172 | |
173 | |
174 no Moose; | |
175 1; | |
176 | |
177 __END__ | |
178 | |
179 =pod | |
180 | |
181 =encoding UTF-8 | |
182 | |
183 =head1 NAME | |
184 | |
185 CPT::Bio::ORF | |
186 | |
187 =head1 VERSION | |
188 | |
189 version 1.99.4 | |
190 | |
191 =function run | |
192 | |
193 =head1 AUTHOR | |
194 | |
195 Eric Rasche <rasche.eric@yandex.ru> | |
196 | |
197 =head1 COPYRIGHT AND LICENSE | |
198 | |
199 This software is Copyright (c) 2014 by Eric Rasche. | |
200 | |
201 This is free software, licensed under: | |
202 | |
203 The GNU General Public License, Version 3, June 2007 | |
204 | |
205 =cut |