comparison micomplete.pl @ 0:23e768cddc7d draft

Initial commit of micomplete
author lionelguy
date Thu, 17 Oct 2013 08:39:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:23e768cddc7d
1 #!/usr/bin/perl -w
2
3 =head1 SYNOPSIS
4
5 micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.
6
7 =head1 USAGE
8
9 micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
10 micomplete.pl --help
11
12 =head1 INPUT
13
14 =head2 -h|--hmm
15
16 A single file containing the HMM profiles against which the proteome will be run.
17
18 =head2 -p|--proteome
19
20 A fasta file containing all protein sequences in a genome.
21
22 =head2 [-c|--cut-off]
23
24 The evalue cut-off to accept the presence of a HMM. 1e-10 by default.
25
26 =head2 [-w|--weights]
27
28 Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths.
29
30 =head2 [-e|--evalue]
31
32 Shows evalue to the best hit, instead of presence/absence.
33
34 =head2 [-s|--save-result-tab]
35
36 Save the result per marker
37
38 =head2 [-r|--save-hmm-tab]
39
40 Save hmmsearch tabular result for this file.
41
42 =head2 [-a|--save-hmm-alignments]
43
44 Save hmmsearch tabular result for this file.
45
46 =head2 [-t|--threads]
47
48 Number of threads to use in the hmmsearch. 15 by default.
49
50 =head2 --help
51
52 Displays a help message.
53
54 =head1 OUTPUT
55
56 A completeness report expressed in fraction, and a list of all hmms identified as positive
57
58 =head1 AUTHOR
59
60 Lionel Guy (lionel.guy@icm.uu.se)
61
62 =head1 DATE
63
64 Mon Oct 14 14:09:46 CEST 2013
65
66 =cut
67
68 # libraries
69 use strict;
70 use Pod::Usage;
71 use Getopt::Long;
72 # For safe use of tempfiles
73 use File::Temp qw/ tempfile tempdir /;
74 use File::Copy qw/ cp /;
75
76 # Options
77 my ($hmm_file, $fasta_in, $weights_file, $display_eval);
78 my $cutoff = 1e-10;
79 my ($save_tab_file, $save_aln_file, $save_res_tab);
80 my $nthreads = 15;
81 my $help;
82 GetOptions(
83 'h|hmm=s' => \$hmm_file,
84 'p|proteome=s' => \$fasta_in,
85 'c|cut-off=s' => \$cutoff,
86 'w|weights=s' => \$weights_file,
87 'e|evalue' => \$display_eval,
88 'r|save-hmm-tab=s' => \$save_tab_file,
89 'a|save-hmm-alignments=s' => \$save_aln_file,
90 's|save-result-tab=s' => \$save_res_tab,
91 't|threads=s' => \$nthreads,
92 'help' => \$help,
93 );
94
95 # Tempfile if not saving results
96 my $template = 'tempXXXXXXXX';
97 my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
98 my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);
99
100 # Check options
101 pod2usage(-verbose => 1) if $help;
102 pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n",
103 -exitstatus => 2 ) unless ($hmm_file);
104 pod2usage(-message => "No proteome file (-p|--proteome) defined\n",
105 -exitstatus => 2 ) unless ($fasta_in);
106
107 # Parse eventual weights
108 my %hmms;
109 if ($weights_file){
110 open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
111 while (<WEIGHTS>){
112 next if /^#/;
113 my ($name, $weight) = split;
114 $hmms{$name} = $weight;
115 }
116 }
117
118 # Parse HMM, lists names
119 open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
120 while (<HMMS>){
121 next unless /^NAME/;
122 my ($bla, $name) = split;
123 if ($weights_file) {
124 die "HMM $name doesn't have weight defined" unless $hmms{$name};
125 } else {
126 $hmms{$name}++;
127 }
128 }
129
130 # Get sum of weights
131 my $total;
132 foreach (values %hmms) {
133 $total += $_;
134 }
135 #print STDERR "Sum of weights: $total\n";
136
137 # Run hmmsearch
138 my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
139 $cmd .= "--noali " unless $save_aln_file;
140 $cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
141 system($cmd) == 0 or die ("Command $cmd failed: $?");
142
143 # Read HMMER output. Assumes the first hit is always the best
144 open HMMER, '<', $tab_file or die $!;
145 my %seen;
146 my $sum;
147 while (<HMMER>){
148 next if /^#/;
149 my ($gid, $dash, $hname, $hid, $evalue) = split;
150 #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
151 die ("No gid at $_\n") unless $gid;
152 die ("HMM not seen in the hmm file") unless $hmms{$hname};
153 # Skip if not first hit, or if evalue under cut-off
154 unless ($seen{$hname} || $evalue > $cutoff){
155 $sum += $hmms{$hname};
156 $seen{$hname} = $evalue;
157 }
158 }
159
160 # Save results if required
161 cp($tab_file, $save_tab_file) if ($save_tab_file);
162 cp($aln_file, $save_aln_file) if ($save_aln_file);
163
164 # Print results
165 printf "Completeness: %.4f", $sum/$total;
166 print "\nN markers found: ", scalar(keys %seen), " out of ",
167 scalar(keys %hmms), "\n";
168 if ($save_res_tab){
169 open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
170 print TAB "#marker\t";
171 if ($display_eval) {
172 print TAB "evalue\n";
173 } else {
174 print TAB "present\n";
175 }
176 foreach my $hmm (sort keys %hmms){
177 print TAB "$hmm\t";
178 if ($seen{$hmm}){
179 if ($display_eval){
180 print TAB "$seen{$hmm}\n";
181 } else {
182 print TAB "1\n";
183 }
184 } else {
185 if ($display_eval){
186 print TAB "NA\n";
187 } else {
188 print TAB "0\n";
189 }
190 }
191 }
192 }
193