changeset 0:23e768cddc7d draft

Initial commit of micomplete
author lionelguy
date Thu, 17 Oct 2013 08:39:52 -0400
parents
children 609b98cf0f9f
files micomplete.pl micomplete.xml micomplete_hmms.loc.sample tool_dependencies.xml
diffstat 4 files changed, 298 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/micomplete.pl	Thu Oct 17 08:39:52 2013 -0400
@@ -0,0 +1,193 @@
+#!/usr/bin/perl -w
+
+=head1 SYNOPSIS
+
+micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.
+
+=head1 USAGE
+
+micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
+micomplete.pl --help
+
+=head1 INPUT
+
+=head2 -h|--hmm
+
+A single file containing the HMM profiles against which the proteome will be run.
+
+=head2 -p|--proteome
+
+A fasta file containing all protein sequences in a genome.
+
+=head2 [-c|--cut-off]
+
+The evalue cut-off to accept the presence of a HMM. 1e-10 by default.
+
+=head2 [-w|--weights]
+
+Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. 
+
+=head2 [-e|--evalue]
+
+Shows evalue to the best hit, instead of presence/absence.
+
+=head2 [-s|--save-result-tab]
+
+Save the result per marker
+
+=head2 [-r|--save-hmm-tab]
+
+Save hmmsearch tabular result for this file.
+
+=head2 [-a|--save-hmm-alignments]
+
+Save hmmsearch tabular result for this file.
+
+=head2 [-t|--threads]
+
+Number of threads to use in the hmmsearch. 15 by default.
+
+=head2 --help
+
+Displays a help message.
+
+=head1 OUTPUT
+
+A completeness report expressed in fraction, and a list of all hmms identified as positive
+
+=head1 AUTHOR
+
+Lionel Guy (lionel.guy@icm.uu.se)
+
+=head1 DATE
+
+Mon Oct 14 14:09:46 CEST 2013
+
+=cut
+
+# libraries
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+# For safe use of tempfiles
+use File::Temp qw/ tempfile tempdir /; 
+use File::Copy qw/ cp /;
+
+# Options
+my ($hmm_file, $fasta_in, $weights_file, $display_eval);
+my $cutoff = 1e-10;
+my ($save_tab_file, $save_aln_file, $save_res_tab);
+my $nthreads = 15;
+my $help;
+GetOptions(
+    'h|hmm=s'      => \$hmm_file,
+    'p|proteome=s' => \$fasta_in,
+    'c|cut-off=s'  => \$cutoff,
+    'w|weights=s'  => \$weights_file,
+    'e|evalue'     => \$display_eval,
+    'r|save-hmm-tab=s'        => \$save_tab_file,
+    'a|save-hmm-alignments=s' => \$save_aln_file,
+    's|save-result-tab=s'     => \$save_res_tab,
+    't|threads=s'  => \$nthreads,
+    'help'         => \$help,
+);
+
+# Tempfile if not saving results
+my $template = 'tempXXXXXXXX';
+my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
+my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);
+
+# Check options
+pod2usage(-verbose => 1) if $help;
+pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", 
+	  -exitstatus => 2 ) unless ($hmm_file);
+pod2usage(-message => "No proteome file (-p|--proteome) defined\n", 
+	  -exitstatus => 2 ) unless ($fasta_in);
+
+# Parse eventual weights
+my %hmms;
+if ($weights_file){
+    open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
+    while (<WEIGHTS>){
+	next if /^#/;
+	my ($name, $weight) = split;
+	$hmms{$name} = $weight;
+    }
+}
+
+# Parse HMM, lists names
+open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
+while (<HMMS>){
+    next unless /^NAME/;
+    my ($bla, $name) = split;
+    if ($weights_file) {
+	die "HMM $name doesn't have weight defined" unless $hmms{$name};
+    } else {
+	$hmms{$name}++;
+    }
+}
+
+# Get sum of weights
+my $total;
+foreach (values %hmms) {
+    $total += $_;
+}
+#print STDERR "Sum of weights: $total\n";
+
+# Run hmmsearch
+my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
+$cmd .= "--noali " unless $save_aln_file;
+$cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
+system($cmd) == 0 or die ("Command $cmd failed: $?");
+
+# Read HMMER output. Assumes the first hit is always the best
+open HMMER, '<', $tab_file or die $!;
+my %seen;
+my $sum;
+while (<HMMER>){
+    next if /^#/;
+    my ($gid, $dash, $hname, $hid, $evalue) = split;
+    #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
+    die ("No gid at $_\n") unless $gid;
+    die ("HMM not seen in the hmm file") unless $hmms{$hname};
+    # Skip if not first hit, or if evalue under cut-off
+    unless ($seen{$hname} || $evalue > $cutoff){
+	$sum += $hmms{$hname};
+	$seen{$hname} = $evalue;
+    }
+}
+
+# Save results if required
+cp($tab_file, $save_tab_file) if ($save_tab_file);
+cp($aln_file, $save_aln_file) if ($save_aln_file);
+
+# Print results
+printf "Completeness: %.4f", $sum/$total;
+print "\nN markers found: ", scalar(keys %seen), " out of ", 
+   scalar(keys %hmms), "\n";
+if ($save_res_tab){
+    open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
+    print TAB "#marker\t"; 
+    if ($display_eval) {
+	print TAB "evalue\n";
+    } else {
+	print TAB "present\n";
+    }
+    foreach my $hmm (sort keys %hmms){
+	print TAB "$hmm\t";
+	if ($seen{$hmm}){
+	    if ($display_eval){
+		print TAB "$seen{$hmm}\n";
+	    } else {
+		print TAB "1\n";
+	    }
+	} else {
+	    if ($display_eval){
+		print TAB "NA\n";
+	    } else {
+		print TAB "0\n";
+	    }
+	}
+    }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/micomplete.xml	Thu Oct 17 08:39:52 2013 -0400
@@ -0,0 +1,90 @@
+<tool id="micomplete" name="micomplete" version="0.1">
+  <description>Using hmm profiles of conserved proteins, reports how much complete a (single-cell amplified) genome is</description>
+  <requirements>
+    <requirement type="package" version="3.1b1">hmmer</requirement>
+  </requirements>
+  <command interpreter="perl">micomplete.pl 
+  #if $hmm_source.hmm_source_type == "cached"
+  --hmm "${ hmm_source.profiles.fields.hmms }"
+  --weights "${ hmm_source.profiles.fields.weights }"
+  #else
+  --hmm "${ hmm_source.own_profiles }"
+    #if $hmm_source.weights.use_weights
+    --weights "${ hmm_source.weights.own_weights }"
+    #end if
+  #end if
+  --proteome $proteome
+  --cut-off $cutoff
+  #if $evalue
+  --evalue 
+  #end if
+  #if $save_hmm_tab
+  --save-hmm-tab $hmm_tab
+  #end if
+  #if $save_hmm_alignments
+  --save-hmm-alignments $hmm_alignments
+  #end if
+  #if $save_res_tab
+  --save-result-tab $res_tab
+  #end if
+  --threads $threads > $output
+  </command>
+  <inputs>
+    <param name="proteome" type="data" format="fasta" label="Proteome file" />
+    <conditional name="hmm_source">
+      <param name="hmm_source_type" type="select" label="HMM file from history or from built-in profile?">
+	<option value="cached">Use a built-in profile</option>
+	<option value="history">Use a profile from the history</option>
+      </param>
+      <when value="cached">
+	<param name="profiles" type="select" label="Select a reference set">
+	  <options from_data_table="micomplete_hmms">
+	    <filter type="sort_by" column="2" />
+	  </options>
+	</param>
+      </when>
+      <when value="history">
+	<param name="own_profiles" type="data" format="text" label="Select a HMM set in the history" />
+	<conditional name="weights">
+	  <param name="use_weights" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Use weights?" />
+	  <when value="True">
+	    <param name="own_weights" type="data" format="tabular" label="Select a weight file (optional)" help="There should be exactly one weight for each profile in the profiles file"/>
+	  </when>
+	</conditional>
+      </when>
+    </conditional>
+    <param name="cutoff" type="float" value="1e-10" label="E-value cutoff to include hits" />
+    <param name="evalue" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Show E-values on the output instead of 0/1?" />
+    <param name="save_res_tab" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Save per marker result tabular file?" />
+    <param name="save_hmm_tab" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Save HMM tab result file?" />
+    <param name="save_hmm_alignments" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Save HMM alignments result file?" />
+    <param name="threads" type="integer" value="15" min="1" label="Number of threads to use" />
+  </inputs>
+  <outputs>
+    <data format="txt" name="output" label="Completeness report" />
+    <data format="tabular" name="res_tab" label="HMM report, tabular format"> 
+      <filter>save_res_tab is True</filter>
+    </data>
+    <data format="tabular" name="hmm_tab" label="HMM report, tabular format"> 
+      <filter>save_hmm_tab is True</filter>
+    </data>
+    <data format="txt" name="hmm_alignments" label="HMM alignments"> 
+      <filter>save_hmm_alignments is True</filter>
+    </data>
+  </outputs>
+  <help>
+***What it does***
+
+Micomplete estimates the level of completion of a genome. This is particularly useful for single-cell amplified genomes. It aligns a proteome to a set of "markers" or conserved proteins, represented by a collection of HMM profiles, possibly with weights. It uses hmmsearch (http://hmmer.janelia.org/, see for example S.R Eddy, PLOS Comp Biol 2011, 7:e1002195) to align proteins
+
+Weights are used to alleviate the effect of having/not having a specific contig which would contain many markers that are generally clustering together. Typically, ribosomal proteins, often used are markers, are often organized in operons. Thus, the impact of having/not having these operons is very important, and should be downplayed to get a more correct estimate.
+
+**License**
+
+This wrapper is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this program.  If not, see http://www.gnu.org/licenses/.
+  </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/micomplete_hmms.loc.sample	Thu Oct 17 08:39:52 2013 -0400
@@ -0,0 +1,9 @@
+#This is a sample file distributed with Galaxy that enables tools to use 
+#a directory of hmm profiles and their weights to calculate completeness.
+#The micomplete_hmms.loc has the following format (white spaces are TABs)
+#
+#<set id>	<description>	<hmm_file_path>	<hmm_weights_path>
+#
+#So, for example for Arch162 (162 conserved markers in archaea):
+#Arch162 	 162 markers for Archaea    /seq/galaxy_datatables/micomplete/Arch162.hmm /seq/galaxy_datatables/micomplete/Arch162.weights
+#Bact139 	 139 markers for Bacteria    /seq/galaxy_datatables/micomplete/Bact139.hmm /seq/galaxy_datatables/micomplete/Bact139.weights
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Oct 17 08:39:52 2013 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="hmmer" version="3.1b1">
+    <repository changeset_revision="007c736bf7e8" name="package_hmmer_3_1" owner="lionelguy" toolshed="http://toolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>