0
|
1 package CPT::Bio;
|
|
2 use Moose;
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use autodie;
|
|
6 use CPT::FiletypeDetector;
|
|
7 use CPT::BioData;
|
|
8 my $bd = CPT::BioData->new();
|
|
9
|
|
10 my $filetype = CPT::FiletypeDetector->new();
|
|
11
|
|
12 has 'var_translate' => ( is => 'rw', isa => 'Bool');
|
|
13 has 'var_header' => ( is => 'rw', isa => 'Bool');
|
|
14 has codonTable => (
|
|
15 is => 'rw',
|
|
16 isa => 'Any',
|
|
17 default => sub {
|
|
18 $bd->getTranslationTable(11)
|
|
19 },
|
|
20 );
|
|
21
|
|
22 sub set_codon_table {
|
|
23 my ($self, $num) = @_;
|
|
24 $self->codonTable($bd->getTranslationTable($num));
|
|
25 }
|
|
26
|
|
27
|
|
28 sub _getFeatureTag {
|
|
29 my ( $self, $feat, $tag ) = @_;
|
|
30 if(! defined($feat)){
|
|
31 warn "Undefined feature";
|
|
32 }
|
|
33 return $feat->has_tag($tag)
|
|
34 ? ( join( ',', $feat->get_tag_values($tag) ) )
|
|
35 : '';
|
|
36 }
|
|
37
|
|
38 sub _getIdentifier {
|
|
39 my ( $self, $feat ) = @_;
|
|
40 my $line;
|
|
41 if ( ref $feat eq 'Bio::Seq::RichSeq' || ref $feat eq 'Bio::Seq' ) {
|
|
42 return $feat->display_id;
|
|
43 }
|
|
44 else {
|
|
45 my $locus_tag = $self->_getFeatureTag( $feat, 'locus_tag' );
|
|
46 if ($locus_tag) {
|
|
47 return $locus_tag;
|
|
48 }
|
|
49 my $gene = $self->_getFeatureTag( $feat, 'gene' );
|
|
50 if ($gene) {
|
|
51 return $gene;
|
|
52 }
|
|
53 my $product = $self->_getFeatureTag( $feat, 'product' );
|
|
54 if ($product) {
|
|
55 return $product;
|
|
56 }
|
|
57 }
|
|
58 return sprintf("%s_%s_%s", $feat->start(), $feat->end(), ($feat->strand() == 1 ? 'sense':'antisense'));
|
|
59 }
|
|
60
|
|
61
|
|
62 sub requestCopy {
|
|
63 my ( $self, %data ) = @_;
|
|
64 use Bio::SeqIO;
|
|
65 if ($data{'file'} ) {
|
|
66 my ($guessed_type) = $filetype->detect( $data{'file'} );
|
|
67 my $seqio = Bio::SeqIO->new(
|
|
68 -file => $data{'file'},
|
|
69 -format => $guessed_type
|
|
70 );
|
|
71 my @results;
|
|
72 while ( my $seqobj = $seqio->next_seq() ) {
|
|
73 return \$seqobj;
|
|
74 }
|
|
75 }
|
|
76 else {
|
|
77 die "No file specified";
|
|
78 }
|
|
79 }
|
|
80
|
|
81
|
|
82 sub getSeqIO {
|
|
83 my ( $self, $file ) = @_;
|
|
84 use Bio::SeqIO;
|
|
85 if ($file ) {
|
|
86 my ($guessed_type) = $filetype->detect( $file );
|
|
87 my $seqio = Bio::SeqIO->new(
|
|
88 -file => $file,
|
|
89 -format => $guessed_type
|
|
90 );
|
|
91 return $seqio;
|
|
92 }
|
|
93 else {
|
|
94 die "No file specified";
|
|
95 }
|
|
96 }
|
|
97
|
|
98
|
|
99 sub parseFile {
|
|
100 my ( $self, %data ) = @_;
|
|
101 use Bio::SeqIO;
|
|
102
|
|
103 my ($guessed_type) = $filetype->detect( $data{'file'} );
|
|
104 my $seqio = Bio::SeqIO->new(
|
|
105 -file => $data{'file'},
|
|
106 -format => $guessed_type
|
|
107 );
|
|
108
|
|
109 # Are we to translate this
|
|
110 $self->var_translate(defined($data{translate}) && $data{translate});
|
|
111 $self->var_header(defined($data{header}) && $data{header});
|
|
112
|
|
113 my @results;
|
|
114 if ( not defined $data{'subset'} ) {
|
|
115 $data{'subset'} = 'all';
|
|
116 }
|
|
117 while ( my $seqobj = $seqio->next_seq() ) {
|
|
118 if (
|
|
119 (ref $data{'subset'} ne 'ARRAY'
|
|
120 && $data{'subset'} eq 'whole' ) # Want the whole thing for a richseq
|
|
121 ||
|
|
122 (ref $seqobj eq 'Bio::Seq' || ref $seqobj eq 'Bio::Seq::fasta')
|
|
123 # or it's a fasta type sequence
|
|
124 )
|
|
125 {
|
|
126 push( @results, $self->handle_seq($seqobj));
|
|
127 }
|
|
128 else #data subset eq sometag
|
|
129 {
|
|
130 my %wanted_tags;
|
|
131 if ( ref $data{'subset'} eq 'ARRAY' ) {
|
|
132 %wanted_tags =
|
|
133 map { $_ => 1 } @{ $data{'subset'} };
|
|
134 }
|
|
135 else {
|
|
136 $wanted_tags{ $data{'subset'} }++;
|
|
137 }
|
|
138 foreach my $feat ( $seqobj->get_SeqFeatures ) {
|
|
139 if (
|
|
140 $wanted_tags{ $feat->primary_tag }
|
|
141 || ( $wanted_tags{'all'}
|
|
142 && $feat->primary_tag ne
|
|
143 "source" )
|
|
144 )
|
|
145 {
|
|
146 push( @results, $self->handle_seq($feat));
|
|
147 }
|
|
148 }
|
|
149 }
|
|
150 }
|
|
151 if ( $data{'callback'} ) {
|
|
152 $data{'callback'}->( \@results );
|
|
153 }
|
|
154 else {
|
|
155 return \@results;
|
|
156 }
|
|
157 }
|
|
158
|
|
159 sub handle_seq {
|
|
160 my ($self, $obj) = @_;
|
|
161
|
|
162 my @line;
|
|
163 if ( $self->var_header() ){
|
|
164 $line[0] = '>' . $self->_getIdentifier($obj);
|
|
165 }
|
|
166
|
|
167 # Get our sequence
|
|
168 $line[1] = $self->intelligent_get_seq($obj);
|
|
169
|
|
170 if ( $self->var_translate() ) {
|
|
171 $line[1] = $self->translate($line[1]);
|
|
172 }
|
|
173 return \@line;
|
|
174 }
|
|
175
|
|
176 sub intelligent_get_seq {
|
|
177 my ($self, $obj, %extra) = @_;
|
|
178 # Top level, e.g., fasta/gbk file, "extra" doesn't apply to these
|
|
179 if ( ref $obj eq 'Bio::Seq::RichSeq' || ref $obj eq 'Bio::Seq' ) {
|
|
180 return $obj->seq;
|
|
181 }else{
|
|
182 return $self->get_seq_from_feature($obj, %extra);
|
|
183 }
|
|
184 }
|
|
185 sub get_seq_from_feature {
|
|
186 my ($self, $feat, %extra) = @_;
|
|
187 my $seq;
|
|
188 my $l;
|
|
189 if($extra{parent}){
|
|
190 $l = $extra{parent}->length();
|
|
191 }
|
|
192
|
|
193 if($extra{upstream}){
|
|
194 if($feat->strand < 0){
|
|
195 my $y = $feat->end + 1;
|
|
196 my $z = $feat->end + $extra{upstream};
|
|
197 if($y < $l){
|
|
198 if($z > $l){
|
|
199 $z = $l;
|
|
200 }
|
|
201 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
|
|
202 }
|
|
203 }else{
|
|
204 my $y = $feat->start - $extra{upstream};
|
|
205 my $z = $feat->start - 1;
|
|
206 if($z > 0){
|
|
207 if($y < 1){
|
|
208 $y = 1;
|
|
209 }
|
|
210 $seq .= $extra{parent}->trunc($y, $z)->seq;
|
|
211 }
|
|
212 }
|
|
213 }
|
|
214 if(ref($feat->location) eq 'Bio::Location::Simple'){
|
|
215 $seq .= $feat->seq->seq();
|
|
216 }else{
|
|
217 $seq .= $feat->spliced_seq->seq();
|
|
218 }
|
|
219 if($extra{downstream}){
|
|
220 if($feat->strand < 0){
|
|
221 my $y = $feat->start - $extra{downstream};
|
|
222 my $z = $feat->start - 1;
|
|
223 if($z > 0){
|
|
224 if($y < 1){
|
|
225 $y = 1;
|
|
226 }
|
|
227 $seq .= $extra{parent}->trunc($y, $z)->revcom->seq;
|
|
228 }
|
|
229 }else{
|
|
230 my $y = $feat->end + 1;
|
|
231 my $z = $feat->end + $extra{downstream};
|
|
232 if($y < $l){
|
|
233 if($z > $l){
|
|
234 $z = $l;
|
|
235 }
|
|
236 $seq .= $extra{parent}->trunc($y, $z)->seq;
|
|
237 }
|
|
238 }
|
|
239 }
|
|
240
|
|
241 return $seq;
|
|
242 }
|
|
243
|
|
244
|
|
245 sub translate {
|
|
246 my ($self, $seq) = @_;
|
|
247 if($seq =~ /^[ACTGN]+$/){
|
|
248 my %ct = %{$self->codonTable};
|
|
249 $seq = join( '' , map { if($ct{$_}){ $ct{$_} }else{ () } } unpack("(A3)*", $seq));
|
|
250 }
|
|
251 return $seq;
|
|
252 }
|
|
253
|
|
254 no Moose;
|
|
255 1;
|
|
256
|
|
257 __END__
|
|
258
|
|
259 =pod
|
|
260
|
|
261 =encoding UTF-8
|
|
262
|
|
263 =head1 NAME
|
|
264
|
|
265 CPT::Bio
|
|
266
|
|
267 =head1 VERSION
|
|
268
|
|
269 version 1.99.4
|
|
270
|
|
271 =head2 _getFeatureTag
|
|
272
|
|
273 my $tag = $libCPT->_getFeatureTag($feature,'note');
|
|
274
|
|
275 returns all values of the given tag, joined with ','.
|
|
276
|
|
277 =head2 requestCopy
|
|
278
|
|
279 my $seqobj = $libCPT->requestCopy('file'=>'test.gbk');
|
|
280
|
|
281 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
|
|
282
|
|
283 =head2 getSeqIO
|
|
284
|
|
285 my $seqio = $libCPT->getSeqIO('file'=>'test.gbk');
|
|
286
|
|
287 requests a 'copy' of a given Bio::SeqIO file, which allows for addition of features before writing out to file.
|
|
288
|
|
289 =head2 parseFile
|
|
290
|
|
291 $libCPT->parseFile(
|
|
292 'file' => $options{'file'},
|
|
293 'callback' => \&func,
|
|
294 'translate' => 1,
|
|
295 'header' => 1,
|
|
296 'subset' => ['CDS', $options{'tag'}],
|
|
297 );
|
|
298
|
|
299 Arguably the most important function in this library, wraps a lot of functionality in a clean wrapper, since most of the scripts we have are written around data munging.
|
|
300
|
|
301 =over 4
|
|
302
|
|
303 =item *
|
|
304
|
|
305 file - the Bio::SeqIO file to process
|
|
306
|
|
307 =item *
|
|
308
|
|
309 callback - the function to send our data to. Done all at once, in an array
|
|
310
|
|
311 =item *
|
|
312
|
|
313 translate - should we translate the sequence to amino acids if it's not already.
|
|
314
|
|
315 =item *
|
|
316
|
|
317 subset - either "whole", a valid tag, or an array of valid tags
|
|
318
|
|
319 =item *
|
|
320
|
|
321 header - Do we want a header (FASTA) with our result set
|
|
322
|
|
323 =back
|
|
324
|
|
325 =head1 AUTHOR
|
|
326
|
|
327 Eric Rasche <rasche.eric@yandex.ru>
|
|
328
|
|
329 =head1 COPYRIGHT AND LICENSE
|
|
330
|
|
331 This software is Copyright (c) 2014 by Eric Rasche.
|
|
332
|
|
333 This is free software, licensed under:
|
|
334
|
|
335 The GNU General Public License, Version 3, June 2007
|
|
336
|
|
337 =cut
|