Mercurial > repos > cpt > cpt_psm_prep
comparison lib/CPT/Bio.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e4de0a0e90c8 | 1:d724f34e671d |
---|---|
1 package CPT::Bio; | |
2 use Moose; | |
3 use strict; | |
4 use warnings; | |
5 use autodie; | |
6 use CPT::FiletypeDetector; | |
7 use CPT::BioData; | |
8 my $bd = CPT::BioData->new(); | |
9 | |
10 my $filetype = CPT::FiletypeDetector->new(); | |
11 | |
12 has 'var_translate' => ( is => 'rw', isa => 'Bool'); | |
13 has 'var_header' => ( is => 'rw', isa => 'Bool'); | |
14 has codonTable => ( | |
15 is => 'rw', | |
16 isa => 'Any', | |
17 default => sub { | |
18 $bd->getTranslationTable(11) | |
19 }, | |
20 ); | |
21 | |
22 sub set_codon_table { | |
23 my ($self, $num) = @_; | |
24 $self->codonTable($bd->getTranslationTable($num)); | |
25 } | |
26 | |
27 | |
28 sub _getFeatureTag { | |
29 my ( $self, $feat, $tag ) = @_; | |
30 if(! defined($feat)){ | |
31 warn "Undefined feature"; | |
32 } | |
33 return $feat->has_tag($tag) | |
34 ? ( join( ',', $feat->get_tag_values($tag) ) ) | |
35 : ''; | |
36 } | |
37 | |
38 sub _getIdentifier { | |
39 my ( $self, $feat ) = @_; | |
40 my $line; | |
41 if ( ref $feat eq 'Bio::Seq::RichSeq' || ref $feat eq 'Bio::Seq' ) { | |
42 return $feat->display_id; | |
43 } | |
44 else { | |
45 my $locus_tag = $self->_getFeatureTag( $feat, 'locus_tag' ); | |
46 if ($locus_tag) { | |
47 return $locus_tag; | |
48 } | |
49 my $gene = $self->_getFeatureTag( $feat, 'gene' ); | |
50 if ($gene) { | |
51 return $gene; | |
52 } | |
53 my $product = $self->_getFeatureTag( $feat, 'product' ); | |
54 if ($product) { | |
55 return $product; | |
56 } | |
57 } | |
58 return sprintf("%s_%s_%s", $feat->start(), $feat->end(), ($feat->strand() == 1 ? 'sense':'antisense')); | |
59 } | |
60 | |
61 | |
62 sub requestCopy { | |
63 my ( $self, %data ) = @_; | |
64 use Bio::SeqIO; | |
65 if ($data{'file'} ) { | |
66 my ($guessed_type) = $filetype->detect( $data{'file'} ); | |
67 my $seqio = Bio::SeqIO->new( | |
68 -file => $data{'file'}, | |
69 -format => $guessed_type | |
70 ); | |
71 my @results; | |
72 while ( my $seqobj = $seqio->next_seq() ) { | |
73 return \$seqobj; | |
74 } | |
75 } | |
76 else { | |
77 die "No file specified"; | |
78 } | |
79 } | |
80 | |
81 | |
82 sub getSeqIO { | |
83 my ( $self, $file ) = @_; | |
84 use Bio::SeqIO; | |
85 if ($file ) { | |
86 my ($guessed_type) = $filetype->detect( $file ); | |
87 my $seqio = Bio::SeqIO->new( | |
88 -file => $file, | |
89 -format => $guessed_type | |
90 ); | |
91 return $seqio; | |
92 } | |
93 else { | |
94 die "No file specified"; | |
95 } | |
96 } | |
97 | |
98 | |
99 sub parseFile { | |
100 my ( $self, %data ) = @_; | |
101 use Bio::SeqIO; | |
102 | |
103 my ($guessed_type) = $filetype->detect( $data{'file'} ); | |
104 my $seqio = Bio::SeqIO->new( | |
105 -file => $data{'file'}, | |
106 -format => $guessed_type | |
107 ); | |
108 | |
109 # Are we to translate this | |
110 $self->var_translate(defined($data{translate}) && $data{translate}); | |
111 $self->var_header(defined($data{header}) && $data{header}); | |
112 | |
113 my @results; | |
114 if ( not defined $data{'subset'} ) { | |
115 $data{'subset'} = 'all'; | |
116 } | |
117 while ( my $seqobj = $seqio->next_seq() ) { | |
118 if ( | |
119 (ref $data{'subset'} ne 'ARRAY' | |
120 && $data{'subset'} eq 'whole' ) # Want the whole thing for a richseq | |
121 || | |
122 (ref $seqobj eq 'Bio::Seq' || ref $seqobj eq 'Bio::Seq::fasta') | |
123 # or it's a fasta type sequence | |
124 ) | |
125 { | |
126 push( @results, $self->handle_seq($seqobj)); | |
127 } | |
128 else #data subset eq sometag | |
129 { | |
130 my %wanted_tags; | |
131 if ( ref $data{'subset'} eq 'ARRAY' ) { | |
132 %wanted_tags = | |
133 map { $_ => 1 } @{ $data{'subset'} }; | |
134 } | |
135 else { | |
136 $wanted_tags{ $data{'subset'} }++; | |
137 } | |
138 foreach my $feat ( $seqobj->get_SeqFeatures ) { | |
139 if ( | |
140 $wanted_tags{ $feat->primary_tag } | |
141 || ( $wanted_tags{'all'} | |
142 && $feat->primary_tag ne | |
143 "source" ) | |
144 ) | |
145 { | |
146 push( @results, $self->handle_seq($feat)); | |
147 } | |
148 } | |
149 } | |
150 } | |
151 if ( $data{'callback'} ) { | |
152 $data{'callback'}->( \@results ); | |
153 } | |
154 else { | |
155 return \@results; | |
156 } | |
157 } | |
158 | |
159 sub handle_seq { | |
160 my ($self, $obj) = @_; | |
161 | |
162 my @line; | |
163 if ( $self->var_header() ){ | |
164 $line[0] = '>' . $self->_getIdentifier($obj); | |
165 } | |
166 | |
167 # Get our sequence | |
168 $line[1] = $self->intelligent_get_seq($obj); | |
169 | |
170 if ( $self->var_translate() ) { | |
171 $line[1] = $self->translate($line[1]); | |
172 } | |
173 return \@line; | |
174 } | |
175 | |
176 sub intelligent_get_seq { | |
177 my ($self, $obj, %extra) = @_; | |
178 # Top level, e.g., fasta/gbk file, "extra" doesn't apply to these | |
179 if ( ref $obj eq 'Bio::Seq::RichSeq' || ref $obj eq 'Bio::Seq' ) { | |
180 return $obj->seq; | |
181 }else{ | |
182 return $self->get_seq_from_feature($obj, %extra); | |
183 } | |
184 } | |
185 sub get_seq_from_feature { | |
186 my ($self, $feat, %extra) = @_; | |
187 my $seq; | |
188 my $l; | |
189 if($extra{parent}){ | |
190 $l = $extra{parent}->length(); | |
191 } | |
192 | |
193 if($extra{upstream}){ | |
194 if($feat->strand < 0){ | |
195 my $y = $feat->end + 1; | |
196 my $z = $feat->end + $extra{upstream}; | |
197 if($y < $l){ | |
198 if($z > $l){ | |
199 $z = $l; | |
200 } | |
201 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq; | |
202 } | |
203 }else{ | |
204 my $y = $feat->start - $extra{upstream}; | |
205 my $z = $feat->start - 1; | |
206 if($z > 0){ | |
207 if($y < 1){ | |
208 $y = 1; | |
209 } | |
210 $seq .= $extra{parent}->trunc($y, $z)->seq; | |
211 } | |
212 } | |
213 } | |
214 if(ref($feat->location) eq 'Bio::Location::Simple'){ | |
215 $seq .= $feat->seq->seq(); | |
216 }else{ | |
217 $seq .= $feat->spliced_seq->seq(); | |
218 } | |
219 if($extra{downstream}){ | |
220 if($feat->strand < 0){ | |
221 my $y = $feat->start - $extra{downstream}; | |
222 my $z = $feat->start - 1; | |
223 if($z > 0){ | |
224 if($y < 1){ | |
225 $y = 1; | |
226 } | |
227 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq; | |
228 } | |
229 }else{ | |
230 my $y = $feat->end + 1; | |
231 my $z = $feat->end + $extra{downstream}; | |
232 if($y < $l){ | |
233 if($z > $l){ | |
234 $z = $l; | |
235 } | |
236 $seq .= $extra{parent}->trunc($y, $z)->seq; | |
237 } | |
238 } | |
239 } | |
240 | |
241 return $seq; | |
242 } | |
243 | |
244 | |
245 sub translate { | |
246 my ($self, $seq) = @_; | |
247 if($seq =~ /^[ACTGN]+$/){ | |
248 my %ct = %{$self->codonTable}; | |
249 $seq = join( '' , map { if($ct{$_}){ $ct{$_} }else{ () } } unpack("(A3)*", $seq)); | |
250 } | |
251 return $seq; | |
252 } | |
253 | |
254 no Moose; | |
255 1; | |
256 | |
257 __END__ | |
258 | |
259 =pod | |
260 | |
261 =encoding UTF-8 | |
262 | |
263 =head1 NAME | |
264 | |
265 CPT::Bio | |
266 | |
267 =head1 VERSION | |
268 | |
269 version 1.99.4 | |
270 | |
271 =head2 _getFeatureTag | |
272 | |
273 my $tag = $libCPT->_getFeatureTag($feature,'note'); | |
274 | |
275 returns all values of the given tag, joined with ','. | |
276 | |
277 =head2 requestCopy | |
278 | |
279 my $seqobj = $libCPT->requestCopy('file'=>'test.gbk'); | |
280 | |
281 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file. | |
282 | |
283 =head2 getSeqIO | |
284 | |
285 my $seqio = $libCPT->getSeqIO('file'=>'test.gbk'); | |
286 | |
287 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file. | |
288 | |
289 =head2 parseFile | |
290 | |
291 $libCPT->parseFile( | |
292 'file' => $options{'file'}, | |
293 'callback' => \&func, | |
294 'translate' => 1, | |
295 'header' => 1, | |
296 'subset' => ['CDS', $options{'tag'}], | |
297 ); | |
298 | |
299 Arguably the most important function in this library, wraps a lot of functionality in a clean wrapper, since most of the scripts we have are written around data munging. | |
300 | |
301 =over 4 | |
302 | |
303 =item * | |
304 | |
305 file - the Bio::SeqIO file to process | |
306 | |
307 =item * | |
308 | |
309 callback - the function to send our data to. Done all at once, in an array | |
310 | |
311 =item * | |
312 | |
313 translate - should we translate the sequence to amino acids if it's not already. | |
314 | |
315 =item * | |
316 | |
317 subset - either "whole", a valid tag, or an array of valid tags | |
318 | |
319 =item * | |
320 | |
321 header - Do we want a header (FASTA) with our result set | |
322 | |
323 =back | |
324 | |
325 =head1 AUTHOR | |
326 | |
327 Eric Rasche <rasche.eric@yandex.ru> | |
328 | |
329 =head1 COPYRIGHT AND LICENSE | |
330 | |
331 This software is Copyright (c) 2014 by Eric Rasche. | |
332 | |
333 This is free software, licensed under: | |
334 | |
335 The GNU General Public License, Version 3, June 2007 | |
336 | |
337 =cut |