view micomplete.pl @ 0:23e768cddc7d draft

Initial commit of micomplete
author lionelguy
date Thu, 17 Oct 2013 08:39:52 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl -w

=head1 SYNOPSIS

micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.

=head1 USAGE

micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
micomplete.pl --help

=head1 INPUT

=head2 -h|--hmm

A single file containing the HMM profiles against which the proteome will be run.

=head2 -p|--proteome

A fasta file containing all protein sequences in a genome.

=head2 [-c|--cut-off]

The evalue cut-off to accept the presence of a HMM. 1e-10 by default.

=head2 [-w|--weights]

Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. 

=head2 [-e|--evalue]

Shows evalue to the best hit, instead of presence/absence.

=head2 [-s|--save-result-tab]

Save the result per marker

=head2 [-r|--save-hmm-tab]

Save hmmsearch tabular result for this file.

=head2 [-a|--save-hmm-alignments]

Save hmmsearch tabular result for this file.

=head2 [-t|--threads]

Number of threads to use in the hmmsearch. 15 by default.

=head2 --help

Displays a help message.

=head1 OUTPUT

A completeness report expressed in fraction, and a list of all hmms identified as positive

=head1 AUTHOR

Lionel Guy (lionel.guy@icm.uu.se)

=head1 DATE

Mon Oct 14 14:09:46 CEST 2013

=cut

# libraries
use strict;
use Pod::Usage;
use Getopt::Long;
# For safe use of tempfiles
use File::Temp qw/ tempfile tempdir /; 
use File::Copy qw/ cp /;

# Options
my ($hmm_file, $fasta_in, $weights_file, $display_eval);
my $cutoff = 1e-10;
my ($save_tab_file, $save_aln_file, $save_res_tab);
my $nthreads = 15;
my $help;
GetOptions(
    'h|hmm=s'      => \$hmm_file,
    'p|proteome=s' => \$fasta_in,
    'c|cut-off=s'  => \$cutoff,
    'w|weights=s'  => \$weights_file,
    'e|evalue'     => \$display_eval,
    'r|save-hmm-tab=s'        => \$save_tab_file,
    'a|save-hmm-alignments=s' => \$save_aln_file,
    's|save-result-tab=s'     => \$save_res_tab,
    't|threads=s'  => \$nthreads,
    'help'         => \$help,
);

# Tempfile if not saving results
my $template = 'tempXXXXXXXX';
my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);

# Check options
pod2usage(-verbose => 1) if $help;
pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", 
	  -exitstatus => 2 ) unless ($hmm_file);
pod2usage(-message => "No proteome file (-p|--proteome) defined\n", 
	  -exitstatus => 2 ) unless ($fasta_in);

# Parse eventual weights
my %hmms;
if ($weights_file){
    open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
    while (<WEIGHTS>){
	next if /^#/;
	my ($name, $weight) = split;
	$hmms{$name} = $weight;
    }
}

# Parse HMM, lists names
open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
while (<HMMS>){
    next unless /^NAME/;
    my ($bla, $name) = split;
    if ($weights_file) {
	die "HMM $name doesn't have weight defined" unless $hmms{$name};
    } else {
	$hmms{$name}++;
    }
}

# Get sum of weights
my $total;
foreach (values %hmms) {
    $total += $_;
}
#print STDERR "Sum of weights: $total\n";

# Run hmmsearch
my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
$cmd .= "--noali " unless $save_aln_file;
$cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
system($cmd) == 0 or die ("Command $cmd failed: $?");

# Read HMMER output. Assumes the first hit is always the best
open HMMER, '<', $tab_file or die $!;
my %seen;
my $sum;
while (<HMMER>){
    next if /^#/;
    my ($gid, $dash, $hname, $hid, $evalue) = split;
    #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
    die ("No gid at $_\n") unless $gid;
    die ("HMM not seen in the hmm file") unless $hmms{$hname};
    # Skip if not first hit, or if evalue under cut-off
    unless ($seen{$hname} || $evalue > $cutoff){
	$sum += $hmms{$hname};
	$seen{$hname} = $evalue;
    }
}

# Save results if required
cp($tab_file, $save_tab_file) if ($save_tab_file);
cp($aln_file, $save_aln_file) if ($save_aln_file);

# Print results
printf "Completeness: %.4f", $sum/$total;
print "\nN markers found: ", scalar(keys %seen), " out of ", 
   scalar(keys %hmms), "\n";
if ($save_res_tab){
    open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
    print TAB "#marker\t"; 
    if ($display_eval) {
	print TAB "evalue\n";
    } else {
	print TAB "present\n";
    }
    foreach my $hmm (sort keys %hmms){
	print TAB "$hmm\t";
	if ($seen{$hmm}){
	    if ($display_eval){
		print TAB "$seen{$hmm}\n";
	    } else {
		print TAB "1\n";
	    }
	} else {
	    if ($display_eval){
		print TAB "NA\n";
	    } else {
		print TAB "0\n";
	    }
	}
    }
}