Mercurial > repos > cpt > cpt_psm_prep
comparison lib/CPT/Bio/ORF.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:50:07 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:e4de0a0e90c8 | 1:d724f34e671d |
|---|---|
| 1 package CPT::Bio::ORF; | |
| 2 use strict; | |
| 3 use warnings; | |
| 4 use autodie; | |
| 5 use Moose; | |
| 6 | |
| 7 has min_gene_length => ( | |
| 8 is => 'rw', | |
| 9 isa => 'Int', | |
| 10 default => sub { | |
| 11 0 | |
| 12 }, | |
| 13 ); | |
| 14 has sc_atg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
| 15 has sc_ttg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
| 16 has sc_ctg => ( is => 'rw', isa => 'Bool', default => sub { 0 } ); | |
| 17 has sc_gtg => ( is => 'rw', isa => 'Bool', default => sub { 1 } ); | |
| 18 | |
| 19 our %code = ( | |
| 20 "TTT" => "F", "TTC" => "F", "TTA" => "L", "TTG" => "L", "TCT" => "S", | |
| 21 "TCC" => "S", "TCA" => "S", "TCG" => "S", "TAT" => "Y", "TAC" => "Y", | |
| 22 "TAA" => "*", "TAG" => "*", "TGT" => "C", "TGC" => "C", "TGA" => "*", | |
| 23 "TGG" => "W", "CTT" => "L", "CTC" => "L", "CTA" => "L", "CTG" => "L", | |
| 24 "CCT" => "P", "CCC" => "P", "CCA" => "P", "CCG" => "P", "CAT" => "H", | |
| 25 "CAC" => "H", "CAA" => "Q", "CAG" => "Q", "CGT" => "R", "CGC" => "R", | |
| 26 "CGA" => "R", "CGG" => "R", "ATT" => "I", "ATC" => "I", "ATA" => "I", | |
| 27 "ATG" => "M", "ACT" => "T", "ACC" => "T", "ACA" => "T", "ACG" => "T", | |
| 28 "AAT" => "N", "AAC" => "N", "AAA" => "K", "AAG" => "K", "AGT" => "S", | |
| 29 "AGC" => "S", "AGA" => "R", "AGG" => "R", "GTT" => "V", "GTC" => "V", | |
| 30 "GTA" => "V", "GTG" => "V", "GCT" => "A", "GCC" => "A", "GCA" => "A", | |
| 31 "GCG" => "A", "GAT" => "D", "GAC" => "D", "GAA" => "E", "GAG" => "E", | |
| 32 "GGT" => "G", "GGC" => "G", "GGA" => "G", "GGG" => "G", | |
| 33 ); | |
| 34 | |
| 35 | |
| 36 | |
| 37 sub run { | |
| 38 my ($self, $sequence) = @_; | |
| 39 # Read through forward strand | |
| 40 my @putative_starts; | |
| 41 | |
| 42 # 30 seconds with a bioperl object | |
| 43 # 5 seconds with string munging. >:| | |
| 44 my $dna = uc( $sequence ); | |
| 45 my $length = length($sequence); | |
| 46 | |
| 47 # Pre-create the regular expressions | |
| 48 my ( $regex_forward, $regex_backwards ); | |
| 49 my $not_statement_f = '^'; | |
| 50 my $not_statement_r = '^'; | |
| 51 if ( !$self->sc_atg() ) { | |
| 52 $not_statement_f .= 'A'; | |
| 53 $not_statement_r .= 'T'; | |
| 54 } | |
| 55 if ( !$self->sc_ctg() ) { | |
| 56 $not_statement_f .= 'C'; | |
| 57 $not_statement_r .= 'G'; | |
| 58 } | |
| 59 if ( !$self->sc_ttg() ) { | |
| 60 $not_statement_f .= 'T'; | |
| 61 $not_statement_r .= 'A'; | |
| 62 } | |
| 63 if ( !$self->sc_gtg() ) { | |
| 64 $not_statement_f .= 'G'; | |
| 65 $not_statement_r .= 'C'; | |
| 66 } | |
| 67 | |
| 68 # If any start is acceptable, we re-add them and remove our ^ | |
| 69 if($not_statement_r eq '^' && $not_statement_f eq '^'){ | |
| 70 $not_statement_f = 'ACTG'; | |
| 71 $not_statement_r = 'ACTG'; | |
| 72 } | |
| 73 $regex_forward = qr/[${not_statement_f}]TG/; | |
| 74 $regex_backwards = qr/CA[${not_statement_r}]/; | |
| 75 | |
| 76 # Collect putative starts | |
| 77 for ( my $i = 1 ; $i < $length - 1 ; $i++ ) { | |
| 78 my $tri_nt = substr( $dna, $i - 1, 3 ); #$seq_obj->subseq($i,$i+2); | |
| 79 if ( $tri_nt =~ $regex_forward ) { | |
| 80 push( @putative_starts, [ $i, '+' ] ); | |
| 81 } | |
| 82 if ( $tri_nt =~ $regex_backwards ) { | |
| 83 push( @putative_starts, [ $i + 2, '-' ] ); | |
| 84 } | |
| 85 } | |
| 86 my %ORFs; | |
| 87 | |
| 88 #Loop through all of the starts we have | |
| 89 my $fc = 0; | |
| 90 my $rc = 0; | |
| 91 foreach (@putative_starts) { | |
| 92 my @putative_start = @{$_}; | |
| 93 | |
| 94 my $final_seq = ""; | |
| 95 | |
| 96 my $add; | |
| 97 my $tri_nt; | |
| 98 if ( $putative_start[1] eq "+" ) { | |
| 99 my $end; | |
| 100 for ( my $k = $putative_start[0] ; $k < $length ; $k = $k + 3 ) | |
| 101 { | |
| 102 my $tri_nt = substr( $dna, $k, 3 ); | |
| 103 my $aa = $code{$tri_nt}; | |
| 104 if ( $aa && $aa ne '*' ) { | |
| 105 $end = $k + 3; | |
| 106 $final_seq .= $tri_nt; | |
| 107 } | |
| 108 else { | |
| 109 last; | |
| 110 } | |
| 111 } | |
| 112 if ( length($final_seq)/3 > $self->min_gene_length() ) { | |
| 113 $ORFs{ 'f_' + $fc++ } = [ | |
| 114 length($final_seq)/3, | |
| 115 $putative_start[0], | |
| 116 $end, | |
| 117 'F', | |
| 118 $final_seq | |
| 119 ]; | |
| 120 } | |
| 121 } # - strand | |
| 122 else { | |
| 123 my $end; | |
| 124 for ( my $k = $putative_start[0] ; $k >= 2 ; $k = $k - 3 ) { | |
| 125 my $tmp = reverse( substr( $dna, $k - 3, 3 ) ); | |
| 126 | |
| 127 $tmp =~ tr/ACTG/qzAC/; | |
| 128 $tmp =~ tr/qz/TG/; | |
| 129 | |
| 130 my $aa = $code{$tmp}; | |
| 131 if ( defined $aa && $aa ne '*' ) { | |
| 132 $end = $k - 1; | |
| 133 $final_seq .= $tmp; | |
| 134 } | |
| 135 else { | |
| 136 last; | |
| 137 } | |
| 138 | |
| 139 } | |
| 140 if ( length($final_seq)/3 > $self->min_gene_length() ) { | |
| 141 $ORFs{ 'r_' + $rc++ } = [ | |
| 142 length($final_seq)/3, | |
| 143 $end, | |
| 144 $putative_start[0], | |
| 145 'R', | |
| 146 $final_seq | |
| 147 ]; | |
| 148 } | |
| 149 } | |
| 150 } | |
| 151 | |
| 152 my @orfs; | |
| 153 | |
| 154 for my $orf_key ( sort( keys(%ORFs) ) ) { | |
| 155 my @tmp= @{ $ORFs{$orf_key} }; | |
| 156 my $seqobj = Bio::Seq->new( | |
| 157 -display_id => sprintf( | |
| 158 'orf%05d_%s', | |
| 159 ($orf_key + 1), $tmp[3], | |
| 160 ), | |
| 161 -desc => sprintf( | |
| 162 '[%s-%s; %s aa long]' | |
| 163 ,$tmp[1], $tmp[2], $tmp[0] | |
| 164 ), | |
| 165 -seq => $tmp[4] | |
| 166 ); | |
| 167 push(@orfs, $seqobj); | |
| 168 } | |
| 169 return @orfs; | |
| 170 } | |
| 171 | |
| 172 | |
| 173 | |
| 174 no Moose; | |
| 175 1; | |
| 176 | |
| 177 __END__ | |
| 178 | |
| 179 =pod | |
| 180 | |
| 181 =encoding UTF-8 | |
| 182 | |
| 183 =head1 NAME | |
| 184 | |
| 185 CPT::Bio::ORF | |
| 186 | |
| 187 =head1 VERSION | |
| 188 | |
| 189 version 1.99.4 | |
| 190 | |
| 191 =function run | |
| 192 | |
| 193 =head1 AUTHOR | |
| 194 | |
| 195 Eric Rasche <rasche.eric@yandex.ru> | |
| 196 | |
| 197 =head1 COPYRIGHT AND LICENSE | |
| 198 | |
| 199 This software is Copyright (c) 2014 by Eric Rasche. | |
| 200 | |
| 201 This is free software, licensed under: | |
| 202 | |
| 203 The GNU General Public License, Version 3, June 2007 | |
| 204 | |
| 205 =cut |
