Mercurial > repos > lionelguy > micomplete
view micomplete.pl @ 0:23e768cddc7d draft
Initial commit of micomplete
author | lionelguy |
---|---|
date | Thu, 17 Oct 2013 08:39:52 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w =head1 SYNOPSIS micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome. =head1 USAGE micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads] micomplete.pl --help =head1 INPUT =head2 -h|--hmm A single file containing the HMM profiles against which the proteome will be run. =head2 -p|--proteome A fasta file containing all protein sequences in a genome. =head2 [-c|--cut-off] The evalue cut-off to accept the presence of a HMM. 1e-10 by default. =head2 [-w|--weights] Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. =head2 [-e|--evalue] Shows evalue to the best hit, instead of presence/absence. =head2 [-s|--save-result-tab] Save the result per marker =head2 [-r|--save-hmm-tab] Save hmmsearch tabular result for this file. =head2 [-a|--save-hmm-alignments] Save hmmsearch tabular result for this file. =head2 [-t|--threads] Number of threads to use in the hmmsearch. 15 by default. =head2 --help Displays a help message. =head1 OUTPUT A completeness report expressed in fraction, and a list of all hmms identified as positive =head1 AUTHOR Lionel Guy (lionel.guy@icm.uu.se) =head1 DATE Mon Oct 14 14:09:46 CEST 2013 =cut # libraries use strict; use Pod::Usage; use Getopt::Long; # For safe use of tempfiles use File::Temp qw/ tempfile tempdir /; use File::Copy qw/ cp /; # Options my ($hmm_file, $fasta_in, $weights_file, $display_eval); my $cutoff = 1e-10; my ($save_tab_file, $save_aln_file, $save_res_tab); my $nthreads = 15; my $help; GetOptions( 'h|hmm=s' => \$hmm_file, 'p|proteome=s' => \$fasta_in, 'c|cut-off=s' => \$cutoff, 'w|weights=s' => \$weights_file, 'e|evalue' => \$display_eval, 'r|save-hmm-tab=s' => \$save_tab_file, 'a|save-hmm-alignments=s' => \$save_aln_file, 's|save-result-tab=s' => \$save_res_tab, 't|threads=s' => \$nthreads, 'help' => \$help, ); # Tempfile if not saving results my $template = 'tempXXXXXXXX'; my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1); my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1); # Check options pod2usage(-verbose => 1) if $help; pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", -exitstatus => 2 ) unless ($hmm_file); pod2usage(-message => "No proteome file (-p|--proteome) defined\n", -exitstatus => 2 ) unless ($fasta_in); # Parse eventual weights my %hmms; if ($weights_file){ open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n"; while (<WEIGHTS>){ next if /^#/; my ($name, $weight) = split; $hmms{$name} = $weight; } } # Parse HMM, lists names open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n"; while (<HMMS>){ next unless /^NAME/; my ($bla, $name) = split; if ($weights_file) { die "HMM $name doesn't have weight defined" unless $hmms{$name}; } else { $hmms{$name}++; } } # Get sum of weights my $total; foreach (values %hmms) { $total += $_; } #print STDERR "Sum of weights: $total\n"; # Run hmmsearch my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads "; $cmd .= "--noali " unless $save_aln_file; $cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file"; system($cmd) == 0 or die ("Command $cmd failed: $?"); # Read HMMER output. Assumes the first hit is always the best open HMMER, '<', $tab_file or die $!; my %seen; my $sum; while (<HMMER>){ next if /^#/; my ($gid, $dash, $hname, $hid, $evalue) = split; #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid); die ("No gid at $_\n") unless $gid; die ("HMM not seen in the hmm file") unless $hmms{$hname}; # Skip if not first hit, or if evalue under cut-off unless ($seen{$hname} || $evalue > $cutoff){ $sum += $hmms{$hname}; $seen{$hname} = $evalue; } } # Save results if required cp($tab_file, $save_tab_file) if ($save_tab_file); cp($aln_file, $save_aln_file) if ($save_aln_file); # Print results printf "Completeness: %.4f", $sum/$total; print "\nN markers found: ", scalar(keys %seen), " out of ", scalar(keys %hmms), "\n"; if ($save_res_tab){ open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n"; print TAB "#marker\t"; if ($display_eval) { print TAB "evalue\n"; } else { print TAB "present\n"; } foreach my $hmm (sort keys %hmms){ print TAB "$hmm\t"; if ($seen{$hmm}){ if ($display_eval){ print TAB "$seen{$hmm}\n"; } else { print TAB "1\n"; } } else { if ($display_eval){ print TAB "NA\n"; } else { print TAB "0\n"; } } } }