Mercurial > repos > lionelguy > micomplete
changeset 0:23e768cddc7d draft
Initial commit of micomplete
author | lionelguy |
---|---|
date | Thu, 17 Oct 2013 08:39:52 -0400 |
parents | |
children | 609b98cf0f9f |
files | micomplete.pl micomplete.xml micomplete_hmms.loc.sample tool_dependencies.xml |
diffstat | 4 files changed, 298 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/micomplete.pl Thu Oct 17 08:39:52 2013 -0400 @@ -0,0 +1,193 @@ +#!/usr/bin/perl -w + +=head1 SYNOPSIS + +micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome. + +=head1 USAGE + +micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads] +micomplete.pl --help + +=head1 INPUT + +=head2 -h|--hmm + +A single file containing the HMM profiles against which the proteome will be run. + +=head2 -p|--proteome + +A fasta file containing all protein sequences in a genome. + +=head2 [-c|--cut-off] + +The evalue cut-off to accept the presence of a HMM. 1e-10 by default. + +=head2 [-w|--weights] + +Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. + +=head2 [-e|--evalue] + +Shows evalue to the best hit, instead of presence/absence. + +=head2 [-s|--save-result-tab] + +Save the result per marker + +=head2 [-r|--save-hmm-tab] + +Save hmmsearch tabular result for this file. + +=head2 [-a|--save-hmm-alignments] + +Save hmmsearch tabular result for this file. + +=head2 [-t|--threads] + +Number of threads to use in the hmmsearch. 15 by default. + +=head2 --help + +Displays a help message. + +=head1 OUTPUT + +A completeness report expressed in fraction, and a list of all hmms identified as positive + +=head1 AUTHOR + +Lionel Guy (lionel.guy@icm.uu.se) + +=head1 DATE + +Mon Oct 14 14:09:46 CEST 2013 + +=cut + +# libraries +use strict; +use Pod::Usage; +use Getopt::Long; +# For safe use of tempfiles +use File::Temp qw/ tempfile tempdir /; +use File::Copy qw/ cp /; + +# Options +my ($hmm_file, $fasta_in, $weights_file, $display_eval); +my $cutoff = 1e-10; +my ($save_tab_file, $save_aln_file, $save_res_tab); +my $nthreads = 15; +my $help; +GetOptions( + 'h|hmm=s' => \$hmm_file, + 'p|proteome=s' => \$fasta_in, + 'c|cut-off=s' => \$cutoff, + 'w|weights=s' => \$weights_file, + 'e|evalue' => \$display_eval, + 'r|save-hmm-tab=s' => \$save_tab_file, + 'a|save-hmm-alignments=s' => \$save_aln_file, + 's|save-result-tab=s' => \$save_res_tab, + 't|threads=s' => \$nthreads, + 'help' => \$help, +); + +# Tempfile if not saving results +my $template = 'tempXXXXXXXX'; +my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1); +my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1); + +# Check options +pod2usage(-verbose => 1) if $help; +pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", + -exitstatus => 2 ) unless ($hmm_file); +pod2usage(-message => "No proteome file (-p|--proteome) defined\n", + -exitstatus => 2 ) unless ($fasta_in); + +# Parse eventual weights +my %hmms; +if ($weights_file){ + open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n"; + while (<WEIGHTS>){ + next if /^#/; + my ($name, $weight) = split; + $hmms{$name} = $weight; + } +} + +# Parse HMM, lists names +open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n"; +while (<HMMS>){ + next unless /^NAME/; + my ($bla, $name) = split; + if ($weights_file) { + die "HMM $name doesn't have weight defined" unless $hmms{$name}; + } else { + $hmms{$name}++; + } +} + +# Get sum of weights +my $total; +foreach (values %hmms) { + $total += $_; +} +#print STDERR "Sum of weights: $total\n"; + +# Run hmmsearch +my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads "; +$cmd .= "--noali " unless $save_aln_file; +$cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file"; +system($cmd) == 0 or die ("Command $cmd failed: $?"); + +# Read HMMER output. Assumes the first hit is always the best +open HMMER, '<', $tab_file or die $!; +my %seen; +my $sum; +while (<HMMER>){ + next if /^#/; + my ($gid, $dash, $hname, $hid, $evalue) = split; + #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid); + die ("No gid at $_\n") unless $gid; + die ("HMM not seen in the hmm file") unless $hmms{$hname}; + # Skip if not first hit, or if evalue under cut-off + unless ($seen{$hname} || $evalue > $cutoff){ + $sum += $hmms{$hname}; + $seen{$hname} = $evalue; + } +} + +# Save results if required +cp($tab_file, $save_tab_file) if ($save_tab_file); +cp($aln_file, $save_aln_file) if ($save_aln_file); + +# Print results +printf "Completeness: %.4f", $sum/$total; +print "\nN markers found: ", scalar(keys %seen), " out of ", + scalar(keys %hmms), "\n"; +if ($save_res_tab){ + open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n"; + print TAB "#marker\t"; + if ($display_eval) { + print TAB "evalue\n"; + } else { + print TAB "present\n"; + } + foreach my $hmm (sort keys %hmms){ + print TAB "$hmm\t"; + if ($seen{$hmm}){ + if ($display_eval){ + print TAB "$seen{$hmm}\n"; + } else { + print TAB "1\n"; + } + } else { + if ($display_eval){ + print TAB "NA\n"; + } else { + print TAB "0\n"; + } + } + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/micomplete.xml Thu Oct 17 08:39:52 2013 -0400 @@ -0,0 +1,90 @@ +<tool id="micomplete" name="micomplete" version="0.1"> + <description>Using hmm profiles of conserved proteins, reports how much complete a (single-cell amplified) genome is</description> + <requirements> + <requirement type="package" version="3.1b1">hmmer</requirement> + </requirements> + <command interpreter="perl">micomplete.pl + #if $hmm_source.hmm_source_type == "cached" + --hmm "${ hmm_source.profiles.fields.hmms }" + --weights "${ hmm_source.profiles.fields.weights }" + #else + --hmm "${ hmm_source.own_profiles }" + #if $hmm_source.weights.use_weights + --weights "${ hmm_source.weights.own_weights }" + #end if + #end if + --proteome $proteome + --cut-off $cutoff + #if $evalue + --evalue + #end if + #if $save_hmm_tab + --save-hmm-tab $hmm_tab + #end if + #if $save_hmm_alignments + --save-hmm-alignments $hmm_alignments + #end if + #if $save_res_tab + --save-result-tab $res_tab + #end if + --threads $threads > $output + </command> + <inputs> + <param name="proteome" type="data" format="fasta" label="Proteome file" /> + <conditional name="hmm_source"> + <param name="hmm_source_type" type="select" label="HMM file from history or from built-in profile?"> + <option value="cached">Use a built-in profile</option> + <option value="history">Use a profile from the history</option> + </param> + <when value="cached"> + <param name="profiles" type="select" label="Select a reference set"> + <options from_data_table="micomplete_hmms"> + <filter type="sort_by" column="2" /> + </options> + </param> + </when> + <when value="history"> + <param name="own_profiles" type="data" format="text" label="Select a HMM set in the history" /> + <conditional name="weights"> + <param name="use_weights" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Use weights?" /> + <when value="True"> + <param name="own_weights" type="data" format="tabular" label="Select a weight file (optional)" help="There should be exactly one weight for each profile in the profiles file"/> + </when> + </conditional> + </when> + </conditional> + <param name="cutoff" type="float" value="1e-10" label="E-value cutoff to include hits" /> + <param name="evalue" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Show E-values on the output instead of 0/1?" /> + <param name="save_res_tab" type="boolean" checked="true" truevalue="True" falsevalue="False" label="Save per marker result tabular file?" /> + <param name="save_hmm_tab" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Save HMM tab result file?" /> + <param name="save_hmm_alignments" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Save HMM alignments result file?" /> + <param name="threads" type="integer" value="15" min="1" label="Number of threads to use" /> + </inputs> + <outputs> + <data format="txt" name="output" label="Completeness report" /> + <data format="tabular" name="res_tab" label="HMM report, tabular format"> + <filter>save_res_tab is True</filter> + </data> + <data format="tabular" name="hmm_tab" label="HMM report, tabular format"> + <filter>save_hmm_tab is True</filter> + </data> + <data format="txt" name="hmm_alignments" label="HMM alignments"> + <filter>save_hmm_alignments is True</filter> + </data> + </outputs> + <help> +***What it does*** + +Micomplete estimates the level of completion of a genome. This is particularly useful for single-cell amplified genomes. It aligns a proteome to a set of "markers" or conserved proteins, represented by a collection of HMM profiles, possibly with weights. It uses hmmsearch (http://hmmer.janelia.org/, see for example S.R Eddy, PLOS Comp Biol 2011, 7:e1002195) to align proteins + +Weights are used to alleviate the effect of having/not having a specific contig which would contain many markers that are generally clustering together. Typically, ribosomal proteins, often used are markers, are often organized in operons. Thus, the impact of having/not having these operons is very important, and should be downplayed to get a more correct estimate. + +**License** + +This wrapper is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/micomplete_hmms.loc.sample Thu Oct 17 08:39:52 2013 -0400 @@ -0,0 +1,9 @@ +#This is a sample file distributed with Galaxy that enables tools to use +#a directory of hmm profiles and their weights to calculate completeness. +#The micomplete_hmms.loc has the following format (white spaces are TABs) +# +#<set id> <description> <hmm_file_path> <hmm_weights_path> +# +#So, for example for Arch162 (162 conserved markers in archaea): +#Arch162 162 markers for Archaea /seq/galaxy_datatables/micomplete/Arch162.hmm /seq/galaxy_datatables/micomplete/Arch162.weights +#Bact139 139 markers for Bacteria /seq/galaxy_datatables/micomplete/Bact139.hmm /seq/galaxy_datatables/micomplete/Bact139.weights
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Oct 17 08:39:52 2013 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="hmmer" version="3.1b1"> + <repository changeset_revision="007c736bf7e8" name="package_hmmer_3_1" owner="lionelguy" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>