comparison lib/CPT/Analysis/TerL.pm @ 1:97ef96676b48 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:26 +0000
parents
children
comparison
equal deleted inserted replaced
0:b18e8268bf4e 1:97ef96676b48
1 package CPT::Analysis::TerL;
2
3 # ABSTRACT: Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies
4
5 use strict;
6 use warnings;
7 use Data::Dumper;
8 use autodie;
9 use Moose;
10 use Bio::SearchIO;
11 use File::ShareDir;
12 use File::Spec;
13
14 has 'hmmer_evalue_cutoff' => ( is => 'rw', isa => 'Int' );
15 has 'blast_evalue_cutoff' => ( is => 'rw', isa => 'Int' );
16 has 'blast_dice_cutoff' => ( is => 'rw', isa => 'Int' );
17
18 has 'search_hmmer' => ( is => 'rw', isa => 'Bool' );
19 has 'search_blast' => ( is => 'rw', isa => 'Bool' );
20
21 has 'data_dir' => ( is => 'rw', isa => 'Any' );
22
23 has 'awk_string' => (
24 is => 'ro',
25 isa => 'Str',
26 default =>
27 'BEGIN{print "#row,query id,subject id,evalue,dice" } {qid=$1;sid=$2;percent_identical=$3;query_length=$4;subject_length=$5;evalue=$6;dice=(2*percent_identical*subject_length/ ( subject_length + query_length ) );printf("%d,%s,%s,%s,%s\n",FNR, qid, sid, dice, evalue);}'
28 );
29
30 has 'input_file' => ( is => 'rw', isa => 'Str' );
31
32 my @column_max = ( 20, 10, 20, 10, 20, 60 );
33 my %hits;
34
35 sub run {
36 my ( $self, %data ) = @_;
37
38 #$self->prepare(%data);
39 my $dir = '/galaxy/tool-data/' # File::ShareDir::dist_dir('CPT-Analysis-TerL');
40 $self->data_dir($dir);
41 $self->input_file( $data{input_file} );
42
43 if ( $self->search_hmmer() ) {
44 $self->hmmer();
45 }
46 if ( $self->search_blast() ) {
47 $self->blast($self);
48 }
49
50 return $self->guess();
51 }
52
53 sub hmmer {
54 my ($self) = @_;
55
56 use Term::ProgressBar 2.00;
57 my $progress = Term::ProgressBar->new(
58 {
59 name => 'HMMER',
60 count => 100,
61 ETA => 'linear',
62 }
63 );
64 $progress->max_update_rate(1);
65
66 my $hmmer_db_dir =
67 File::Spec->catdir( $self->data_dir(), 'db', 'hmmer' );
68 my @hmmer_dbs = glob( File::Spec->catfile( $hmmer_db_dir, "*.hmm" ) );
69
70 my $i = 0;
71 foreach (@hmmer_dbs) {
72 $progress->update( 100 * ( $i++ ) / scalar @hmmer_dbs );
73 my $db_name = substr( $_, rindex( $_, '/' ) + 1, -4 );
74 my $output_filename = sprintf(
75 '%s.%s.out',
76 substr(
77 $self->input_file(), 0,
78 rindex( $self->input_file(), '.' )
79 ),
80 $db_name
81 );
82 my $query = sprintf( 'hmmsearch %s %s > %s',
83 $_, $self->input_file(), $output_filename );
84 system($query);
85
86 my $in = Bio::SearchIO->new(
87 -format => 'hmmer',
88 -file => $output_filename
89 );
90 while ( my $result = $in->next_result ) {
91 while ( my $hit = $result->next_hit ) {
92 while ( my $hsp = $hit->next_hsp ) {
93 my ( $from, $to ) =
94 ( $result->query_name, $hit->name );
95 unless ( $hits{$to}{$from} ) {
96 $hits{$to}{$from} = [];
97 }
98 push(
99 $hits{$to}{$from},
100 {
101 'type' => 'hmmer',
102 'data' => {
103 'evalue' => (
104 $hsp
105 ->evalue
106 eq '0'
107 ? '0.0'
108 : $hsp
109 ->evalue
110 ),
111 }
112 }
113 );
114 }
115 }
116 }
117 unlink($output_filename);
118 }
119 $progress->update(100);
120 }
121
122 sub blast {
123 my ($self) = @_;
124
125 use Term::ProgressBar 2.00;
126 my $progress = Term::ProgressBar->new(
127 {
128 name => 'Blast',
129 count => 100,
130 ETA => 'linear',
131 }
132 );
133 $progress->max_update_rate(1);
134
135 my $blast_db_dir =
136 File::Spec->catdir( $self->data_dir(), 'db', 'blast' );
137 my @blast_dbs =
138 map { substr( $_, 0, -4 ) }
139 glob( File::Spec->catfile( $blast_db_dir, "*.phr" ) );
140
141 my $i = 0;
142
143 #my %hits;
144
145 foreach my $blast_db (@blast_dbs) {
146 $progress->update( 100 * ( $i++ ) / scalar(@blast_dbs) );
147 my $output_str =
148 substr( $blast_db, rindex( $blast_db, '/' ) + 1 ) . '.csv';
149 my $query = sprintf(
150 'psiblast -query %s -db %s -evalue %s -outfmt "6 qseqid sseqid pident qlen slen evalue" | awk \'%s\' > %s',
151 $self->input_file(), $blast_db, '1e-5',
152 $self->awk_string(), $output_str );
153 system($query);
154 open( my $tmpfh, '<', $output_str );
155 while (<$tmpfh>) {
156 chomp $_;
157 if ( $_ !~ /^#/ ) {
158 my @line = split( /,/, $_ );
159 unless ( $hits{ $line[1] }{ $line[2] } ) {
160 $hits{ $line[1] }{ $line[2] } = [];
161 }
162 push(
163 $hits{ $line[1] }{ $line[2] },
164 {
165 'type' => 'psiblast',
166 'data' => {
167 'evalue' => $line[4],
168 'dice' => $line[3],
169 }
170 }
171 );
172 }
173 }
174 close($tmpfh);
175 unlink($output_str);
176
177 }
178 $progress->update(100);
179 }
180
181 sub guess {
182 my ($self) = @_;
183
184 open( my $groupings_fh,
185 '<',
186 File::Spec->catfile( $self->data_dir(), 'groupings.tsv' ) );
187
188 # Load groupings.tsv into memory
189 my %data;
190 while (<$groupings_fh>) {
191 if ( $_ !~ /^#/ ) {
192 chomp $_;
193 my ( $major, $minor, $hit ) = split( /\t/, $_ );
194 unless ( $data{$major}{$minor} ) {
195 $data{$major}{$minor} = [];
196 }
197 push( $data{$major}{$minor}, $hit );
198 }
199 }
200
201 # Create a reverse lookup table
202 my %rdata;
203 foreach my $i ( keys %data ) {
204 foreach my $j ( keys %{ $data{$i} } ) {
205 foreach my $k ( @{ $data{$i}{$j} } ) {
206 if ( defined($k) ) {
207 $rdata{$k} = [ $i, $j ];
208 }
209 else {
210 print "$i $j\n";
211 }
212 }
213 }
214 }
215
216 # Table printing stuff
217 my @header = (
218 'Major Category',
219 'Major hits',
220 'Minor Category',
221 'Minor hits',
222 'Analysis',
223 'Evidence Type',
224 'Evidence'
225 );
226 my %output;
227
228 # Loop across the input keys
229 foreach my $input_key ( keys %hits ) {
230 my %guesses;
231 my %guess_evidence;
232
233 # And across all of the hits that the query hit to
234 foreach my $against ( keys %{ $hits{$input_key} } ) {
235
236 # We look at the evidence
237 my @evidence = @{ $hits{$input_key}{$against} };
238
239 my ( $type_major, $type_minor );
240 if ( $rdata{$against} ) {
241 ( $type_major, $type_minor ) =
242 @{ $rdata{$against} };
243 }
244 else {
245 ( $type_major, $type_minor ) = (
246 substr(
247 $against, 0,
248 rindex( $against, '_' )
249 ),
250 substr(
251 $against,
252 rindex( $against, '_' ) + 1
253 )
254 );
255 }
256
257 # Prepare hashes.
258 unless ( $guesses{$type_major} ) {
259 $guesses{$type_major} = ();
260 }
261 unless ( $guesses{$type_major}{$type_minor} ) {
262 $guesses{$type_major}{$type_minor} = ();
263 }
264
265 # Loop across the evidence
266 foreach my $piece_of_evidence (@evidence) {
267
268 # Here is an example piece of evidence
269 # 'GK_Gilmour_Gene43' => {
270 # 'SP18' => [
271 # {
272 # 'data' => {
273 # 'evalue' => '2e-08',
274 # 'dice' => '23.8627'
275 # },
276 # 'type' => 'psiblast'
277 # }
278 # ],
279 #
280 if ( $type_major !~ /subject i/ ) {
281 my %piece = %{$piece_of_evidence};
282 if (
283 $self->validate_evidence(
284 $piece_of_evidence) > 0
285 )
286 {
287 $guess_evidence{$type_major}++;
288 $guess_evidence{
289 "$type_major$type_minor"
290 }++;
291
292 # If it's not defined, set up sub arrays.
293 unless ( $guesses{$type_major}
294 {$type_minor}
295 { $piece{'type'} } )
296 {
297 if ( $piece{'type'}
298 eq 'psiblast' )
299 {
300 $guesses{
301 $type_major
302 }{$type_minor}
303 {
304 $piece{
305 'type'
306 }
307 }
308 = {
309 'evalue'
310 => [],
311 'dice'
312 => []
313 };
314 }
315 else {
316 $guesses{
317 $type_major
318 }{$type_minor}
319 {
320 $piece{
321 'type'
322 }
323 }
324 = { 'evalue'
325 => []
326 };
327 }
328 }
329
330 # If it's zero, correct to zero.
331 if ( $piece{'data'}{'evalue'}
332 eq '0.0' )
333 {
334 $piece{'data'}{'evalue'}
335 = '0.0';
336 }
337
338 # Add our evalue
339 push(
340 $guesses{$type_major}
341 {$type_minor}
342 { $piece{'type'} }
343 {'evalue'},
344 $piece{'data'}{'evalue'}
345 );
346
347 # And if psiblast, add dice
348 if ( $piece{'type'} eq
349 'psiblast' )
350 {
351 push(
352 $guesses{
353 $type_major
354 }{$type_minor}
355 {
356 $piece{
357 'type'
358 }
359 }{'dice'},
360 $piece{'data'}
361 {'dice'}
362 );
363 }
364 }
365 }
366 }
367 }
368
369 my @output_sheet;
370
371 foreach my $major ( keys %guesses ) {
372 if ( $guess_evidence{$major} ) {
373 foreach my $minor ( keys %{ $guesses{$major} } )
374 {
375 if ( $guess_evidence{"$major$minor"} ) {
376 foreach my $evidence_category (
377 keys %{
378 $guesses{$major}
379 {$minor}
380 }
381 )
382 {
383 if ( $evidence_category
384 ne 'evidence' )
385 {
386 # things like evalue, dice
387 my %hits = %{
388 $guesses{
389 $major
390 }{
391 $minor
392 }{
393 $evidence_category
394 }
395 };
396 foreach
397 my $subtype (
398 keys
399 %hits )
400 { # should be evalue, dice
401 push(
402 @output_sheet,
403 [
404 $major,
405 $guess_evidence{
406 $major
407 }
408 ,
409 $minor,
410 $guess_evidence{
411 "$major$minor"
412 }
413 ,
414 $evidence_category,
415 $subtype,
416 join
417 (
418 ',',
419 @{
420 $hits{
421 $subtype
422 }
423 }
424 )
425 ]
426 );
427 }
428 }
429 }
430 }
431 }
432 }
433 }
434
435 if ( !scalar @output_sheet ) {
436 @output_sheet = ( ['No evidence above threshold'], );
437 }
438 $output{$input_key} = {
439 header => \@header,
440 data => \@output_sheet,
441 };
442 }
443 return \%output;
444 }
445
446 sub validate_evidence {
447 my ( $self, $piece_of_evidence ) = @_;
448 my %piece = %{$piece_of_evidence};
449
450 #my ($self, $type, $subtype, $value) = @_;
451 # {
452 # 'data' => {
453 # 'evalue' => '2e-08',
454 # 'dice' => '23.8627'
455 # },
456 # 'type' => 'psiblast'
457 # }
458 if ( $piece{type} eq 'hmmer' ) {
459 my $value = $piece{data}{evalue};
460 if ( $value eq '0.0' || $value eq '0' ) {
461 return 1;
462 }
463 elsif ( !defined($value) || $value eq '' ) {
464 return 0;
465 }
466 else {
467 return ( log($value) < $self->hmmer_evalue_cutoff() )
468 ; # -64
469 }
470 }
471 elsif ( $piece{type} eq 'psiblast' ) {
472 my $evalue = $piece{data}{evalue};
473 my $dice = $piece{data}{dice};
474
475 my $evalue_return = 0;
476 if ( $evalue eq '0.0' || $evalue eq '0' ) {
477 $evalue_return = 1;
478 }
479 else {
480 $evalue_return =
481 ( log($evalue) < $self->blast_evalue_cutoff() ); #-140
482 }
483
484 return $evalue_return
485 && ( $dice > $self->blast_dice_cutoff() ); # 30
486 }
487 }
488
489 1;
490
491 __END__
492
493 =pod
494
495 =encoding UTF-8
496
497 =head1 NAME
498
499 CPT::Analysis::TerL - Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies
500
501 =head1 VERSION
502
503 version 1.96
504
505 =head1 AUTHOR
506
507 Eric Rasche <rasche.eric@yandex.ru>
508
509 =head1 COPYRIGHT AND LICENSE
510
511 This software is Copyright (c) 2014 by Eric Rasche.
512
513 This is free software, licensed under:
514
515 The GNU General Public License, Version 3, June 2007
516
517 =cut