Mercurial > repos > lionelguy > micomplete
comparison micomplete.pl @ 0:23e768cddc7d draft
Initial commit of micomplete
author | lionelguy |
---|---|
date | Thu, 17 Oct 2013 08:39:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:23e768cddc7d |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 =head1 SYNOPSIS | |
4 | |
5 micomplete.pl - Given a series of HMM profiles and a query proteome, and optionally, weights for the profiles, returns the completeness of a genome. | |
6 | |
7 =head1 USAGE | |
8 | |
9 micomplete.pl -h|--hmm_profiles -p|--proteome [-w|--weights] [-e|--evalue] [-s|--save-result-tab] [-r|--save-hmm-results] [-t|--threads] | |
10 micomplete.pl --help | |
11 | |
12 =head1 INPUT | |
13 | |
14 =head2 -h|--hmm | |
15 | |
16 A single file containing the HMM profiles against which the proteome will be run. | |
17 | |
18 =head2 -p|--proteome | |
19 | |
20 A fasta file containing all protein sequences in a genome. | |
21 | |
22 =head2 [-c|--cut-off] | |
23 | |
24 The evalue cut-off to accept the presence of a HMM. 1e-10 by default. | |
25 | |
26 =head2 [-w|--weights] | |
27 | |
28 Optionally, the weights to apply to the HMMs, for the completeness report. A tab file containing (i) a column with names corresponding to the names in the hmm profiles and (ii) a column with with the weighths. | |
29 | |
30 =head2 [-e|--evalue] | |
31 | |
32 Shows evalue to the best hit, instead of presence/absence. | |
33 | |
34 =head2 [-s|--save-result-tab] | |
35 | |
36 Save the result per marker | |
37 | |
38 =head2 [-r|--save-hmm-tab] | |
39 | |
40 Save hmmsearch tabular result for this file. | |
41 | |
42 =head2 [-a|--save-hmm-alignments] | |
43 | |
44 Save hmmsearch tabular result for this file. | |
45 | |
46 =head2 [-t|--threads] | |
47 | |
48 Number of threads to use in the hmmsearch. 15 by default. | |
49 | |
50 =head2 --help | |
51 | |
52 Displays a help message. | |
53 | |
54 =head1 OUTPUT | |
55 | |
56 A completeness report expressed in fraction, and a list of all hmms identified as positive | |
57 | |
58 =head1 AUTHOR | |
59 | |
60 Lionel Guy (lionel.guy@icm.uu.se) | |
61 | |
62 =head1 DATE | |
63 | |
64 Mon Oct 14 14:09:46 CEST 2013 | |
65 | |
66 =cut | |
67 | |
68 # libraries | |
69 use strict; | |
70 use Pod::Usage; | |
71 use Getopt::Long; | |
72 # For safe use of tempfiles | |
73 use File::Temp qw/ tempfile tempdir /; | |
74 use File::Copy qw/ cp /; | |
75 | |
76 # Options | |
77 my ($hmm_file, $fasta_in, $weights_file, $display_eval); | |
78 my $cutoff = 1e-10; | |
79 my ($save_tab_file, $save_aln_file, $save_res_tab); | |
80 my $nthreads = 15; | |
81 my $help; | |
82 GetOptions( | |
83 'h|hmm=s' => \$hmm_file, | |
84 'p|proteome=s' => \$fasta_in, | |
85 'c|cut-off=s' => \$cutoff, | |
86 'w|weights=s' => \$weights_file, | |
87 'e|evalue' => \$display_eval, | |
88 'r|save-hmm-tab=s' => \$save_tab_file, | |
89 'a|save-hmm-alignments=s' => \$save_aln_file, | |
90 's|save-result-tab=s' => \$save_res_tab, | |
91 't|threads=s' => \$nthreads, | |
92 'help' => \$help, | |
93 ); | |
94 | |
95 # Tempfile if not saving results | |
96 my $template = 'tempXXXXXXXX'; | |
97 my ($fh_tab, $tab_file) = tempfile($template, DIR => '.', UNLINK => 1); | |
98 my ($fh_aln, $aln_file) = tempfile($template, DIR => '.', UNLINK => 1); | |
99 | |
100 # Check options | |
101 pod2usage(-verbose => 1) if $help; | |
102 pod2usage(-message => "No hmm profile file (-h|--hmm) defined\n", | |
103 -exitstatus => 2 ) unless ($hmm_file); | |
104 pod2usage(-message => "No proteome file (-p|--proteome) defined\n", | |
105 -exitstatus => 2 ) unless ($fasta_in); | |
106 | |
107 # Parse eventual weights | |
108 my %hmms; | |
109 if ($weights_file){ | |
110 open WEIGHTS, '<', $weights_file or die "Cannot open $weights_file\n"; | |
111 while (<WEIGHTS>){ | |
112 next if /^#/; | |
113 my ($name, $weight) = split; | |
114 $hmms{$name} = $weight; | |
115 } | |
116 } | |
117 | |
118 # Parse HMM, lists names | |
119 open HMMS, '<', $hmm_file or die "Cannot open $hmm_file\n"; | |
120 while (<HMMS>){ | |
121 next unless /^NAME/; | |
122 my ($bla, $name) = split; | |
123 if ($weights_file) { | |
124 die "HMM $name doesn't have weight defined" unless $hmms{$name}; | |
125 } else { | |
126 $hmms{$name}++; | |
127 } | |
128 } | |
129 | |
130 # Get sum of weights | |
131 my $total; | |
132 foreach (values %hmms) { | |
133 $total += $_; | |
134 } | |
135 #print STDERR "Sum of weights: $total\n"; | |
136 | |
137 # Run hmmsearch | |
138 my $cmd = "hmmsearch --tblout $tab_file --cpu $nthreads "; | |
139 $cmd .= "--noali " unless $save_aln_file; | |
140 $cmd .= "-E $cutoff $hmm_file $fasta_in > $aln_file"; | |
141 system($cmd) == 0 or die ("Command $cmd failed: $?"); | |
142 | |
143 # Read HMMER output. Assumes the first hit is always the best | |
144 open HMMER, '<', $tab_file or die $!; | |
145 my %seen; | |
146 my $sum; | |
147 while (<HMMER>){ | |
148 next if /^#/; | |
149 my ($gid, $dash, $hname, $hid, $evalue) = split; | |
150 #my ($bla, $gi, $ref, $pid) = split(/\|/, $gid); | |
151 die ("No gid at $_\n") unless $gid; | |
152 die ("HMM not seen in the hmm file") unless $hmms{$hname}; | |
153 # Skip if not first hit, or if evalue under cut-off | |
154 unless ($seen{$hname} || $evalue > $cutoff){ | |
155 $sum += $hmms{$hname}; | |
156 $seen{$hname} = $evalue; | |
157 } | |
158 } | |
159 | |
160 # Save results if required | |
161 cp($tab_file, $save_tab_file) if ($save_tab_file); | |
162 cp($aln_file, $save_aln_file) if ($save_aln_file); | |
163 | |
164 # Print results | |
165 printf "Completeness: %.4f", $sum/$total; | |
166 print "\nN markers found: ", scalar(keys %seen), " out of ", | |
167 scalar(keys %hmms), "\n"; | |
168 if ($save_res_tab){ | |
169 open TAB, '>', $save_res_tab or die "Could not open $save_res_tab: $?\n"; | |
170 print TAB "#marker\t"; | |
171 if ($display_eval) { | |
172 print TAB "evalue\n"; | |
173 } else { | |
174 print TAB "present\n"; | |
175 } | |
176 foreach my $hmm (sort keys %hmms){ | |
177 print TAB "$hmm\t"; | |
178 if ($seen{$hmm}){ | |
179 if ($display_eval){ | |
180 print TAB "$seen{$hmm}\n"; | |
181 } else { | |
182 print TAB "1\n"; | |
183 } | |
184 } else { | |
185 if ($display_eval){ | |
186 print TAB "NA\n"; | |
187 } else { | |
188 print TAB "0\n"; | |
189 } | |
190 } | |
191 } | |
192 } | |
193 |