diff lib/CPT/Analysis/TerL.pm @ 1:97ef96676b48 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:26 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/CPT/Analysis/TerL.pm	Mon Jun 05 02:51:26 2023 +0000
@@ -0,0 +1,517 @@
+package CPT::Analysis::TerL;
+
+# ABSTRACT: Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies
+
+use strict;
+use warnings;
+use Data::Dumper;
+use autodie;
+use Moose;
+use Bio::SearchIO;
+use File::ShareDir;
+use File::Spec;
+
+has 'hmmer_evalue_cutoff' => ( is => 'rw', isa => 'Int' );
+has 'blast_evalue_cutoff' => ( is => 'rw', isa => 'Int' );
+has 'blast_dice_cutoff'   => ( is => 'rw', isa => 'Int' );
+
+has 'search_hmmer' => ( is => 'rw', isa => 'Bool' );
+has 'search_blast' => ( is => 'rw', isa => 'Bool' );
+
+has 'data_dir' => ( is => 'rw', isa => 'Any' );
+
+has 'awk_string' => (
+	is  => 'ro',
+	isa => 'Str',
+	default =>
+'BEGIN{print "#row,query id,subject id,evalue,dice" } {qid=$1;sid=$2;percent_identical=$3;query_length=$4;subject_length=$5;evalue=$6;dice=(2*percent_identical*subject_length/ ( subject_length + query_length ) );printf("%d,%s,%s,%s,%s\n",FNR, qid, sid, dice, evalue);}'
+);
+
+has 'input_file' => ( is => 'rw', isa => 'Str' );
+
+my @column_max = ( 20, 10, 20, 10, 20, 60 );
+my %hits;
+
+sub run {
+	my ( $self, %data ) = @_;
+
+	#$self->prepare(%data);
+	my $dir = '/galaxy/tool-data/' # File::ShareDir::dist_dir('CPT-Analysis-TerL');
+	$self->data_dir($dir);
+	$self->input_file( $data{input_file} );
+
+	if ( $self->search_hmmer() ) {
+		$self->hmmer();
+	}
+	if ( $self->search_blast() ) {
+		$self->blast($self);
+	}
+
+	return $self->guess();
+}
+
+sub hmmer {
+	my ($self) = @_;
+
+	use Term::ProgressBar 2.00;
+	my $progress = Term::ProgressBar->new(
+		{
+			name  => 'HMMER',
+			count => 100,
+			ETA   => 'linear',
+		}
+	);
+	$progress->max_update_rate(1);
+
+	my $hmmer_db_dir =
+	  File::Spec->catdir( $self->data_dir(), 'db', 'hmmer' );
+	my @hmmer_dbs = glob( File::Spec->catfile( $hmmer_db_dir, "*.hmm" ) );
+
+	my $i = 0;
+	foreach (@hmmer_dbs) {
+		$progress->update( 100 * ( $i++ ) / scalar @hmmer_dbs );
+		my $db_name = substr( $_, rindex( $_, '/' ) + 1, -4 );
+		my $output_filename = sprintf(
+			'%s.%s.out',
+			substr(
+				$self->input_file(), 0,
+				rindex( $self->input_file(), '.' )
+			),
+			$db_name
+		);
+		my $query = sprintf( 'hmmsearch %s %s > %s',
+			$_, $self->input_file(), $output_filename );
+		system($query);
+
+		my $in = Bio::SearchIO->new(
+			-format => 'hmmer',
+			-file   => $output_filename
+		);
+		while ( my $result = $in->next_result ) {
+			while ( my $hit = $result->next_hit ) {
+				while ( my $hsp = $hit->next_hsp ) {
+					my ( $from, $to ) =
+					  ( $result->query_name, $hit->name );
+					unless ( $hits{$to}{$from} ) {
+						$hits{$to}{$from} = [];
+					}
+					push(
+						$hits{$to}{$from},
+						{
+							'type' => 'hmmer',
+							'data' => {
+								'evalue' => (
+									$hsp
+									  ->evalue
+									  eq '0'
+									? '0.0'
+									: $hsp
+									  ->evalue
+								),
+							}
+						}
+					);
+				}
+			}
+		}
+		unlink($output_filename);
+	}
+	$progress->update(100);
+}
+
+sub blast {
+	my ($self) = @_;
+
+	use Term::ProgressBar 2.00;
+	my $progress = Term::ProgressBar->new(
+		{
+			name  => 'Blast',
+			count => 100,
+			ETA   => 'linear',
+		}
+	);
+	$progress->max_update_rate(1);
+
+	my $blast_db_dir =
+	  File::Spec->catdir( $self->data_dir(), 'db', 'blast' );
+	my @blast_dbs =
+	  map { substr( $_, 0, -4 ) }
+	  glob( File::Spec->catfile( $blast_db_dir, "*.phr" ) );
+
+	my $i = 0;
+
+	#my %hits;
+
+	foreach my $blast_db (@blast_dbs) {
+		$progress->update( 100 * ( $i++ ) / scalar(@blast_dbs) );
+		my $output_str =
+		  substr( $blast_db, rindex( $blast_db, '/' ) + 1 ) . '.csv';
+		my $query = sprintf(
+'psiblast -query %s -db %s -evalue %s -outfmt "6 qseqid sseqid pident qlen slen evalue" | awk \'%s\' > %s',
+			$self->input_file(), $blast_db, '1e-5',
+			$self->awk_string(), $output_str );
+		system($query);
+		open( my $tmpfh, '<', $output_str );
+		while (<$tmpfh>) {
+			chomp $_;
+			if ( $_ !~ /^#/ ) {
+				my @line = split( /,/, $_ );
+				unless ( $hits{ $line[1] }{ $line[2] } ) {
+					$hits{ $line[1] }{ $line[2] } = [];
+				}
+				push(
+					$hits{ $line[1] }{ $line[2] },
+					{
+						'type' => 'psiblast',
+						'data' => {
+							'evalue' => $line[4],
+							'dice'   => $line[3],
+						}
+					}
+				);
+			}
+		}
+		close($tmpfh);
+		unlink($output_str);
+
+	}
+	$progress->update(100);
+}
+
+sub guess {
+	my ($self) = @_;
+
+	open( my $groupings_fh,
+		'<',
+		File::Spec->catfile( $self->data_dir(), 'groupings.tsv' ) );
+
+	# Load groupings.tsv into memory
+	my %data;
+	while (<$groupings_fh>) {
+		if ( $_ !~ /^#/ ) {
+			chomp $_;
+			my ( $major, $minor, $hit ) = split( /\t/, $_ );
+			unless ( $data{$major}{$minor} ) {
+				$data{$major}{$minor} = [];
+			}
+			push( $data{$major}{$minor}, $hit );
+		}
+	}
+
+	# Create a reverse lookup table
+	my %rdata;
+	foreach my $i ( keys %data ) {
+		foreach my $j ( keys %{ $data{$i} } ) {
+			foreach my $k ( @{ $data{$i}{$j} } ) {
+				if ( defined($k) ) {
+					$rdata{$k} = [ $i, $j ];
+				}
+				else {
+					print "$i $j\n";
+				}
+			}
+		}
+	}
+
+	# Table printing stuff
+	my @header = (
+		'Major Category',
+		'Major hits',
+		'Minor Category',
+		'Minor hits',
+		'Analysis',
+		'Evidence Type',
+		'Evidence'
+	);
+	my %output;
+
+	# Loop across the input keys
+	foreach my $input_key ( keys %hits ) {
+		my %guesses;
+		my %guess_evidence;
+
+		# And across all of the hits that the query hit to
+		foreach my $against ( keys %{ $hits{$input_key} } ) {
+
+			# We look at the evidence
+			my @evidence = @{ $hits{$input_key}{$against} };
+
+			my ( $type_major, $type_minor );
+			if ( $rdata{$against} ) {
+				( $type_major, $type_minor ) =
+				  @{ $rdata{$against} };
+			}
+			else {
+				( $type_major, $type_minor ) = (
+					substr(
+						$against, 0,
+						rindex( $against, '_' )
+					),
+					substr(
+						$against,
+						rindex( $against, '_' ) + 1
+					)
+				);
+			}
+
+			# Prepare hashes.
+			unless ( $guesses{$type_major} ) {
+				$guesses{$type_major} = ();
+			}
+			unless ( $guesses{$type_major}{$type_minor} ) {
+				$guesses{$type_major}{$type_minor} = ();
+			}
+
+			# Loop across the evidence
+			foreach my $piece_of_evidence (@evidence) {
+
+				# Here is an example piece of evidence
+				#    'GK_Gilmour_Gene43' => {
+				#        'SP18' => [
+				#             {
+				#               'data' => {
+				#                           'evalue' => '2e-08',
+				#                           'dice' => '23.8627'
+				#                         },
+				#               'type' => 'psiblast'
+				#             }
+				#           ],
+				#
+				if ( $type_major !~ /subject i/ ) {
+					my %piece = %{$piece_of_evidence};
+					if (
+						$self->validate_evidence(
+							$piece_of_evidence) > 0
+					  )
+					{
+						$guess_evidence{$type_major}++;
+						$guess_evidence{
+							"$type_major$type_minor"
+						}++;
+
+				       # If it's not defined, set up sub arrays.
+						unless ( $guesses{$type_major}
+							{$type_minor}
+							{ $piece{'type'} } )
+						{
+							if ( $piece{'type'}
+								eq 'psiblast' )
+							{
+								$guesses{
+									$type_major
+								  }{$type_minor}
+								  {
+									$piece{
+'type'
+									}
+								  }
+								  = {
+									'evalue'
+									  => [],
+									'dice'
+									  => []
+								  };
+							}
+							else {
+								$guesses{
+									$type_major
+								  }{$type_minor}
+								  {
+									$piece{
+'type'
+									}
+								  }
+								  = { 'evalue'
+									  => []
+								  };
+							}
+						}
+
+						# If it's zero, correct to zero.
+						if ( $piece{'data'}{'evalue'}
+							eq '0.0' )
+						{
+							$piece{'data'}{'evalue'}
+							  = '0.0';
+						}
+
+						# Add our evalue
+						push(
+							$guesses{$type_major}
+							  {$type_minor}
+							  { $piece{'type'} }
+							  {'evalue'},
+							$piece{'data'}{'evalue'}
+						);
+
+						# And if psiblast, add dice
+						if ( $piece{'type'} eq
+							'psiblast' )
+						{
+							push(
+								$guesses{
+									$type_major
+								  }{$type_minor}
+								  {
+									$piece{
+'type'
+									}
+								  }{'dice'},
+								$piece{'data'}
+								  {'dice'}
+							);
+						}
+					}
+				}
+			}
+		}
+
+		my @output_sheet;
+
+		foreach my $major ( keys %guesses ) {
+			if ( $guess_evidence{$major} ) {
+				foreach my $minor ( keys %{ $guesses{$major} } )
+				{
+					if ( $guess_evidence{"$major$minor"} ) {
+						foreach my $evidence_category (
+							keys %{
+								$guesses{$major}
+								  {$minor}
+							}
+						  )
+						{
+							if ( $evidence_category
+								ne 'evidence' )
+							{
+						      # things like evalue, dice
+								my %hits = %{
+									$guesses{
+										$major
+									  }{
+										$minor
+									  }{
+										$evidence_category
+									  }
+								};
+								foreach
+								  my $subtype (
+									keys
+									%hits )
+								{ # should be evalue, dice
+									push(
+										@output_sheet,
+										[
+											$major,
+											$guess_evidence{
+												$major
+											  }
+											,
+											$minor,
+											$guess_evidence{
+"$major$minor"
+											  }
+											,
+											$evidence_category,
+											$subtype,
+											join
+											  (
+',',
+												@{
+													$hits{
+														$subtype
+													}
+												}
+											  )
+										]
+									);
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+
+		if ( !scalar @output_sheet ) {
+			@output_sheet = ( ['No evidence above threshold'], );
+		}
+		$output{$input_key} = {
+			header => \@header,
+			data   => \@output_sheet,
+		};
+	}
+	return \%output;
+}
+
+sub validate_evidence {
+	my ( $self, $piece_of_evidence ) = @_;
+	my %piece = %{$piece_of_evidence};
+
+	#my ($self, $type, $subtype, $value) = @_;
+	#             {
+	#               'data' => {
+	#                           'evalue' => '2e-08',
+	#                           'dice' => '23.8627'
+	#                         },
+	#               'type' => 'psiblast'
+	#             }
+	if ( $piece{type} eq 'hmmer' ) {
+		my $value = $piece{data}{evalue};
+		if ( $value eq '0.0' || $value eq '0' ) {
+			return 1;
+		}
+		elsif ( !defined($value) || $value eq '' ) {
+			return 0;
+		}
+		else {
+			return ( log($value) < $self->hmmer_evalue_cutoff() )
+			  ;    # -64
+		}
+	}
+	elsif ( $piece{type} eq 'psiblast' ) {
+		my $evalue = $piece{data}{evalue};
+		my $dice   = $piece{data}{dice};
+
+		my $evalue_return = 0;
+		if ( $evalue eq '0.0' || $evalue eq '0' ) {
+			$evalue_return = 1;
+		}
+		else {
+			$evalue_return =
+			  ( log($evalue) < $self->blast_evalue_cutoff() ); #-140
+		}
+
+		return $evalue_return
+		  && ( $dice > $self->blast_dice_cutoff() );               # 30
+	}
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding UTF-8
+
+=head1 NAME
+
+CPT::Analysis::TerL - Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies
+
+=head1 VERSION
+
+version 1.96
+
+=head1 AUTHOR
+
+Eric Rasche <rasche.eric@yandex.ru>
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is Copyright (c) 2014 by Eric Rasche.
+
+This is free software, licensed under:
+
+  The GNU General Public License, Version 3, June 2007
+
+=cut