diff micomplete.pl @ 0:23e768cddc7d draft

Initial commit of micomplete
author lionelguy
date Thu, 17 Oct 2013 08:39:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/micomplete.pl	Thu Oct 17 08:39:52 2013 -0400
@@ -0,0 +1,193 @@
+#!/usr/bin/perl -w
+
+=head1 SYNOPSIS
+
+micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.
+
+=head1 USAGE
+
+micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
+micomplete.pl --help
+
+=head1 INPUT
+
+=head2 -h|--hmm
+
+A single file containing the HMM profiles against which the proteome will be run.
+
+=head2 -p|--proteome
+
+A fasta file containing all protein sequences in a genome.
+
+=head2 [-c|--cut-off]
+
+The evalue cut-off to accept the presence of a HMM. 1e-10 by default.
+
+=head2 [-w|--weights]
+
+Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. 
+
+=head2 [-e|--evalue]
+
+Shows evalue to the best hit, instead of presence/absence.
+
+=head2 [-s|--save-result-tab]
+
+Save the result per marker
+
+=head2 [-r|--save-hmm-tab]
+
+Save hmmsearch tabular result for this file.
+
+=head2 [-a|--save-hmm-alignments]
+
+Save hmmsearch tabular result for this file.
+
+=head2 [-t|--threads]
+
+Number of threads to use in the hmmsearch. 15 by default.
+
+=head2 --help
+
+Displays a help message.
+
+=head1 OUTPUT
+
+A completeness report expressed in fraction, and a list of all hmms identified as positive
+
+=head1 AUTHOR
+
+Lionel Guy (lionel.guy@icm.uu.se)
+
+=head1 DATE
+
+Mon Oct 14 14:09:46 CEST 2013
+
+=cut
+
+# libraries
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+# For safe use of tempfiles
+use File::Temp qw/ tempfile tempdir /; 
+use File::Copy qw/ cp /;
+
+# Options
+my ($hmm_file, $fasta_in, $weights_file, $display_eval);
+my $cutoff = 1e-10;
+my ($save_tab_file, $save_aln_file, $save_res_tab);
+my $nthreads = 15;
+my $help;
+GetOptions(
+    'h|hmm=s'      => \$hmm_file,
+    'p|proteome=s' => \$fasta_in,
+    'c|cut-off=s'  => \$cutoff,
+    'w|weights=s'  => \$weights_file,
+    'e|evalue'     => \$display_eval,
+    'r|save-hmm-tab=s'        => \$save_tab_file,
+    'a|save-hmm-alignments=s' => \$save_aln_file,
+    's|save-result-tab=s'     => \$save_res_tab,
+    't|threads=s'  => \$nthreads,
+    'help'         => \$help,
+);
+
+# Tempfile if not saving results
+my $template = 'tempXXXXXXXX';
+my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
+my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);
+
+# Check options
+pod2usage(-verbose => 1) if $help;
+pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", 
+	  -exitstatus => 2 ) unless ($hmm_file);
+pod2usage(-message => "No proteome file (-p|--proteome) defined\n", 
+	  -exitstatus => 2 ) unless ($fasta_in);
+
+# Parse eventual weights
+my %hmms;
+if ($weights_file){
+    open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
+    while (<WEIGHTS>){
+	next if /^#/;
+	my ($name, $weight) = split;
+	$hmms{$name} = $weight;
+    }
+}
+
+# Parse HMM, lists names
+open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
+while (<HMMS>){
+    next unless /^NAME/;
+    my ($bla, $name) = split;
+    if ($weights_file) {
+	die "HMM $name doesn't have weight defined" unless $hmms{$name};
+    } else {
+	$hmms{$name}++;
+    }
+}
+
+# Get sum of weights
+my $total;
+foreach (values %hmms) {
+    $total += $_;
+}
+#print STDERR "Sum of weights: $total\n";
+
+# Run hmmsearch
+my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
+$cmd .= "--noali " unless $save_aln_file;
+$cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
+system($cmd) == 0 or die ("Command $cmd failed: $?");
+
+# Read HMMER output. Assumes the first hit is always the best
+open HMMER, '<', $tab_file or die $!;
+my %seen;
+my $sum;
+while (<HMMER>){
+    next if /^#/;
+    my ($gid, $dash, $hname, $hid, $evalue) = split;
+    #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
+    die ("No gid at $_\n") unless $gid;
+    die ("HMM not seen in the hmm file") unless $hmms{$hname};
+    # Skip if not first hit, or if evalue under cut-off
+    unless ($seen{$hname} || $evalue > $cutoff){
+	$sum += $hmms{$hname};
+	$seen{$hname} = $evalue;
+    }
+}
+
+# Save results if required
+cp($tab_file, $save_tab_file) if ($save_tab_file);
+cp($aln_file, $save_aln_file) if ($save_aln_file);
+
+# Print results
+printf "Completeness: %.4f", $sum/$total;
+print "\nN markers found: ", scalar(keys %seen), " out of ", 
+   scalar(keys %hmms), "\n";
+if ($save_res_tab){
+    open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
+    print TAB "#marker\t"; 
+    if ($display_eval) {
+	print TAB "evalue\n";
+    } else {
+	print TAB "present\n";
+    }
+    foreach my $hmm (sort keys %hmms){
+	print TAB "$hmm\t";
+	if ($seen{$hmm}){
+	    if ($display_eval){
+		print TAB "$seen{$hmm}\n";
+	    } else {
+		print TAB "1\n";
+	    }
+	} else {
+	    if ($display_eval){
+		print TAB "NA\n";
+	    } else {
+		print TAB "0\n";
+	    }
+	}
+    }
+}
+