comparison lib/CPT/Bio.pm @ 1:8691c1c61a8e draft default tip

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:48:47 +0000
parents
children
comparison
equal deleted inserted replaced
0:54c7a3ea81e2 1:8691c1c61a8e
1 package CPT::Bio;
2 use Moose;
3 use strict;
4 use warnings;
5 use autodie;
6 use CPT::FiletypeDetector;
7 use CPT::BioData;
8 my $bd = CPT::BioData->new();
9
10 my $filetype = CPT::FiletypeDetector->new();
11
12 has 'var_translate' => ( is => 'rw', isa => 'Bool');
13 has 'var_header' => ( is => 'rw', isa => 'Bool');
14 has codonTable => (
15 is => 'rw',
16 isa => 'Any',
17 default => sub {
18 $bd->getTranslationTable(11)
19 },
20 );
21
22 sub set_codon_table {
23 my ($self, $num) = @_;
24 $self->codonTable($bd->getTranslationTable($num));
25 }
26
27
28 sub _getFeatureTag {
29 my ( $self, $feat, $tag ) = @_;
30 if(! defined($feat)){
31 warn "Undefined feature";
32 }
33 return $feat->has_tag($tag)
34 ? ( join( ',', $feat->get_tag_values($tag) ) )
35 : '';
36 }
37
38 sub _getIdentifier {
39 my ( $self, $feat ) = @_;
40 my $line;
41 if ( ref $feat eq 'Bio::Seq::RichSeq' || ref $feat eq 'Bio::Seq' ) {
42 return $feat->display_id;
43 }
44 else {
45 my $locus_tag = $self->_getFeatureTag( $feat, 'locus_tag' );
46 if ($locus_tag) {
47 return $locus_tag;
48 }
49 my $gene = $self->_getFeatureTag( $feat, 'gene' );
50 if ($gene) {
51 return $gene;
52 }
53 my $product = $self->_getFeatureTag( $feat, 'product' );
54 if ($product) {
55 return $product;
56 }
57 }
58 return sprintf("%s_%s_%s", $feat->start(), $feat->end(), ($feat->strand() == 1 ? 'sense':'antisense'));
59 }
60
61
62 sub requestCopy {
63 my ( $self, %data ) = @_;
64 use Bio::SeqIO;
65 if ($data{'file'} ) {
66 my ($guessed_type) = $filetype->detect( $data{'file'} );
67 my $seqio = Bio::SeqIO->new(
68 -file => $data{'file'},
69 -format => $guessed_type
70 );
71 my @results;
72 while ( my $seqobj = $seqio->next_seq() ) {
73 return \$seqobj;
74 }
75 }
76 else {
77 die "No file specified";
78 }
79 }
80
81
82 sub getSeqIO {
83 my ( $self, $file ) = @_;
84 use Bio::SeqIO;
85 if ($file ) {
86 my ($guessed_type) = $filetype->detect( $file );
87 my $seqio = Bio::SeqIO->new(
88 -file => $file,
89 -format => $guessed_type
90 );
91 return $seqio;
92 }
93 else {
94 die "No file specified";
95 }
96 }
97
98
99 sub parseFile {
100 my ( $self, %data ) = @_;
101 use Bio::SeqIO;
102
103 my ($guessed_type) = $filetype->detect( $data{'file'} );
104 my $seqio = Bio::SeqIO->new(
105 -file => $data{'file'},
106 -format => $guessed_type
107 );
108
109 # Are we to translate this
110 $self->var_translate(defined($data{translate}) && $data{translate});
111 $self->var_header(defined($data{header}) && $data{header});
112
113 my @results;
114 if ( not defined $data{'subset'} ) {
115 $data{'subset'} = 'all';
116 }
117 while ( my $seqobj = $seqio->next_seq() ) {
118 if (
119 (ref $data{'subset'} ne 'ARRAY'
120 && $data{'subset'} eq 'whole' ) # Want the whole thing for a richseq
121 ||
122 (ref $seqobj eq 'Bio::Seq' || ref $seqobj eq 'Bio::Seq::fasta')
123 # or it's a fasta type sequence
124 )
125 {
126 push( @results, $self->handle_seq($seqobj));
127 }
128 else #data subset eq sometag
129 {
130 my %wanted_tags;
131 if ( ref $data{'subset'} eq 'ARRAY' ) {
132 %wanted_tags =
133 map { $_ => 1 } @{ $data{'subset'} };
134 }
135 else {
136 $wanted_tags{ $data{'subset'} }++;
137 }
138 foreach my $feat ( $seqobj->get_SeqFeatures ) {
139 if (
140 $wanted_tags{ $feat->primary_tag }
141 || ( $wanted_tags{'all'}
142 && $feat->primary_tag ne
143 "source" )
144 )
145 {
146 push( @results, $self->handle_seq($feat));
147 }
148 }
149 }
150 }
151 if ( $data{'callback'} ) {
152 $data{'callback'}->( \@results );
153 }
154 else {
155 return \@results;
156 }
157 }
158
159 sub handle_seq {
160 my ($self, $obj) = @_;
161
162 my @line;
163 if ( $self->var_header() ){
164 $line[0] = '>' . $self->_getIdentifier($obj);
165 }
166
167 # Get our sequence
168 $line[1] = $self->intelligent_get_seq($obj);
169
170 if ( $self->var_translate() ) {
171 $line[1] = $self->translate($line[1]);
172 }
173 return \@line;
174 }
175
176 sub intelligent_get_seq {
177 my ($self, $obj, %extra) = @_;
178 # Top level, e.g., fasta/gbk file, "extra" doesn't apply to these
179 if ( ref $obj eq 'Bio::Seq::RichSeq' || ref $obj eq 'Bio::Seq' ) {
180 return $obj->seq;
181 }else{
182 return $self->get_seq_from_feature($obj, %extra);
183 }
184 }
185 sub get_seq_from_feature {
186 my ($self, $feat, %extra) = @_;
187 my $seq;
188 my $l;
189 if($extra{parent}){
190 $l = $extra{parent}->length();
191 }
192
193 if($extra{upstream}){
194 if($feat->strand < 0){
195 my $y = $feat->end + 1;
196 my $z = $feat->end + $extra{upstream};
197 if($y < $l){
198 if($z > $l){
199 $z = $l;
200 }
201 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
202 }
203 }else{
204 my $y = $feat->start - $extra{upstream};
205 my $z = $feat->start - 1;
206 if($z > 0){
207 if($y < 1){
208 $y = 1;
209 }
210 $seq .= $extra{parent}->trunc($y, $z)->seq;
211 }
212 }
213 }
214 if(ref($feat->location) eq 'Bio::Location::Simple'){
215 $seq .= $feat->seq->seq();
216 }else{
217 $seq .= $feat->spliced_seq->seq();
218 }
219 if($extra{downstream}){
220 if($feat->strand < 0){
221 my $y = $feat->start - $extra{downstream};
222 my $z = $feat->start - 1;
223 if($z > 0){
224 if($y < 1){
225 $y = 1;
226 }
227 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
228 }
229 }else{
230 my $y = $feat->end + 1;
231 my $z = $feat->end + $extra{downstream};
232 if($y < $l){
233 if($z > $l){
234 $z = $l;
235 }
236 $seq .= $extra{parent}->trunc($y, $z)->seq;
237 }
238 }
239 }
240
241 return $seq;
242 }
243
244
245 sub translate {
246 my ($self, $seq) = @_;
247 if($seq =~ /^[ACTGN]+$/){
248 my %ct = %{$self->codonTable};
249 $seq = join( '' , map { if($ct{$_}){ $ct{$_} }else{ () } } unpack("(A3)*", $seq));
250 }
251 return $seq;
252 }
253
254 no Moose;
255 1;
256
257 __END__
258
259 =pod
260
261 =encoding UTF-8
262
263 =head1 NAME
264
265 CPT::Bio
266
267 =head1 VERSION
268
269 version 1.99.4
270
271 =head2 _getFeatureTag
272
273 my $tag = $libCPT->_getFeatureTag($feature,'note');
274
275 returns all values of the given tag, joined with ','.
276
277 =head2 requestCopy
278
279 my $seqobj = $libCPT->requestCopy('file'=>'test.gbk');
280
281 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
282
283 =head2 getSeqIO
284
285 my $seqio = $libCPT->getSeqIO('file'=>'test.gbk');
286
287 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
288
289 =head2 parseFile
290
291 $libCPT->parseFile(
292 'file' => $options{'file'},
293 'callback' => \&func,
294 'translate' => 1,
295 'header' => 1,
296 'subset' => ['CDS', $options{'tag'}],
297 );
298
299 Arguably the most important function in this library, wraps a lot of functionality in a clean wrapper, since most of the scripts we have are written around data munging.
300
301 =over 4
302
303 =item *
304
305 file - the Bio::SeqIO file to process
306
307 =item *
308
309 callback - the function to send our data to. Done all at once, in an array
310
311 =item *
312
313 translate - should we translate the sequence to amino acids if it's not already.
314
315 =item *
316
317 subset - either "whole", a valid tag, or an array of valid tags
318
319 =item *
320
321 header - Do we want a header (FASTA) with our result set
322
323 =back
324
325 =head1 AUTHOR
326
327 Eric Rasche <rasche.eric@yandex.ru>
328
329 =head1 COPYRIGHT AND LICENSE
330
331 This software is Copyright (c) 2014 by Eric Rasche.
332
333 This is free software, licensed under:
334
335 The GNU General Public License, Version 3, June 2007
336
337 =cut