0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 =head1 SYNOPSIS
|
|
4
|
|
5 micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome.
|
|
6
|
|
7 =head1 USAGE
|
|
8
|
|
9 micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads]
|
|
10 micomplete.pl --help
|
|
11
|
|
12 =head1 INPUT
|
|
13
|
|
14 =head2 -h|--hmm
|
|
15
|
|
16 A single file containing the HMM profiles against which the proteome will be run.
|
|
17
|
|
18 =head2 -p|--proteome
|
|
19
|
|
20 A fasta file containing all protein sequences in a genome.
|
|
21
|
|
22 =head2 [-c|--cut-off]
|
|
23
|
|
24 The evalue cut-off to accept the presence of a HMM. 1e-10 by default.
|
|
25
|
|
26 =head2 [-w|--weights]
|
|
27
|
|
28 Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths.
|
|
29
|
|
30 =head2 [-e|--evalue]
|
|
31
|
|
32 Shows evalue to the best hit, instead of presence/absence.
|
|
33
|
|
34 =head2 [-s|--save-result-tab]
|
|
35
|
|
36 Save the result per marker
|
|
37
|
|
38 =head2 [-r|--save-hmm-tab]
|
|
39
|
|
40 Save hmmsearch tabular result for this file.
|
|
41
|
|
42 =head2 [-a|--save-hmm-alignments]
|
|
43
|
|
44 Save hmmsearch tabular result for this file.
|
|
45
|
|
46 =head2 [-t|--threads]
|
|
47
|
|
48 Number of threads to use in the hmmsearch. 15 by default.
|
|
49
|
|
50 =head2 --help
|
|
51
|
|
52 Displays a help message.
|
|
53
|
|
54 =head1 OUTPUT
|
|
55
|
|
56 A completeness report expressed in fraction, and a list of all hmms identified as positive
|
|
57
|
|
58 =head1 AUTHOR
|
|
59
|
|
60 Lionel Guy (lionel.guy@icm.uu.se)
|
|
61
|
|
62 =head1 DATE
|
|
63
|
|
64 Mon Oct 14 14:09:46 CEST 2013
|
|
65
|
|
66 =cut
|
|
67
|
|
68 # libraries
|
|
69 use strict;
|
|
70 use Pod::Usage;
|
|
71 use Getopt::Long;
|
|
72 # For safe use of tempfiles
|
|
73 use File::Temp qw/ tempfile tempdir /;
|
|
74 use File::Copy qw/ cp /;
|
|
75
|
|
76 # Options
|
|
77 my ($hmm_file, $fasta_in, $weights_file, $display_eval);
|
|
78 my $cutoff = 1e-10;
|
|
79 my ($save_tab_file, $save_aln_file, $save_res_tab);
|
|
80 my $nthreads = 15;
|
|
81 my $help;
|
|
82 GetOptions(
|
|
83 'h|hmm=s' => \$hmm_file,
|
|
84 'p|proteome=s' => \$fasta_in,
|
|
85 'c|cut-off=s' => \$cutoff,
|
|
86 'w|weights=s' => \$weights_file,
|
|
87 'e|evalue' => \$display_eval,
|
|
88 'r|save-hmm-tab=s' => \$save_tab_file,
|
|
89 'a|save-hmm-alignments=s' => \$save_aln_file,
|
|
90 's|save-result-tab=s' => \$save_res_tab,
|
|
91 't|threads=s' => \$nthreads,
|
|
92 'help' => \$help,
|
|
93 );
|
|
94
|
|
95 # Tempfile if not saving results
|
|
96 my $template = 'tempXXXXXXXX';
|
|
97 my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1);
|
|
98 my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1);
|
|
99
|
|
100 # Check options
|
|
101 pod2usage(-verbose => 1) if $help;
|
|
102 pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n",
|
|
103 -exitstatus => 2 ) unless ($hmm_file);
|
|
104 pod2usage(-message => "No proteome file (-p|--proteome) defined\n",
|
|
105 -exitstatus => 2 ) unless ($fasta_in);
|
|
106
|
|
107 # Parse eventual weights
|
|
108 my %hmms;
|
|
109 if ($weights_file){
|
|
110 open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n";
|
|
111 while (<WEIGHTS>){
|
|
112 next if /^#/;
|
|
113 my ($name, $weight) = split;
|
|
114 $hmms{$name} = $weight;
|
|
115 }
|
|
116 }
|
|
117
|
|
118 # Parse HMM, lists names
|
|
119 open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n";
|
|
120 while (<HMMS>){
|
|
121 next unless /^NAME/;
|
|
122 my ($bla, $name) = split;
|
|
123 if ($weights_file) {
|
|
124 die "HMM $name doesn't have weight defined" unless $hmms{$name};
|
|
125 } else {
|
|
126 $hmms{$name}++;
|
|
127 }
|
|
128 }
|
|
129
|
|
130 # Get sum of weights
|
|
131 my $total;
|
|
132 foreach (values %hmms) {
|
|
133 $total += $_;
|
|
134 }
|
|
135 #print STDERR "Sum of weights: $total\n";
|
|
136
|
|
137 # Run hmmsearch
|
|
138 my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads ";
|
|
139 $cmd .= "--noali " unless $save_aln_file;
|
|
140 $cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file";
|
|
141 system($cmd) == 0 or die ("Command $cmd failed: $?");
|
|
142
|
|
143 # Read HMMER output. Assumes the first hit is always the best
|
|
144 open HMMER, '<', $tab_file or die $!;
|
|
145 my %seen;
|
|
146 my $sum;
|
|
147 while (<HMMER>){
|
|
148 next if /^#/;
|
|
149 my ($gid, $dash, $hname, $hid, $evalue) = split;
|
|
150 #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid);
|
|
151 die ("No gid at $_\n") unless $gid;
|
|
152 die ("HMM not seen in the hmm file") unless $hmms{$hname};
|
|
153 # Skip if not first hit, or if evalue under cut-off
|
|
154 unless ($seen{$hname} || $evalue > $cutoff){
|
|
155 $sum += $hmms{$hname};
|
|
156 $seen{$hname} = $evalue;
|
|
157 }
|
|
158 }
|
|
159
|
|
160 # Save results if required
|
|
161 cp($tab_file, $save_tab_file) if ($save_tab_file);
|
|
162 cp($aln_file, $save_aln_file) if ($save_aln_file);
|
|
163
|
|
164 # Print results
|
|
165 printf "Completeness: %.4f", $sum/$total;
|
|
166 print "\nN markers found: ", scalar(keys %seen), " out of ",
|
|
167 scalar(keys %hmms), "\n";
|
|
168 if ($save_res_tab){
|
|
169 open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n";
|
|
170 print TAB "#marker\t";
|
|
171 if ($display_eval) {
|
|
172 print TAB "evalue\n";
|
|
173 } else {
|
|
174 print TAB "present\n";
|
|
175 }
|
|
176 foreach my $hmm (sort keys %hmms){
|
|
177 print TAB "$hmm\t";
|
|
178 if ($seen{$hmm}){
|
|
179 if ($display_eval){
|
|
180 print TAB "$seen{$hmm}\n";
|
|
181 } else {
|
|
182 print TAB "1\n";
|
|
183 }
|
|
184 } else {
|
|
185 if ($display_eval){
|
|
186 print TAB "NA\n";
|
|
187 } else {
|
|
188 print TAB "0\n";
|
|
189 }
|
|
190 }
|
|
191 }
|
|
192 }
|
|
193
|