comparison COG/bac-genomics-scripts/prot_finder/prot_binary_matrix.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 C<prot_binary_matrix.pl> - create a presence/absence matrix from
12 C<prot_finder.pl> output
13
14 =head1 SYNOPSIS
15
16 C<perl prot_binary_matrix.pl blast_hits.tsv E<gt> binary_matrix.tsv>
17
18 B<or>
19
20 C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl E<gt> binary_matrix.tsv>
21
22 =head1 DESCRIPTION
23
24 This script is intended to create a presence/absence matrix from the
25 significant C<prot_finder.pl>
26 L<B<BLASTP>|http://blast.ncbi.nlm.nih.gov/Blast.cgi>) hits (or the
27 companion bash pipe C<prot_finder_pipe.sh>). The tab-separated
28 C<prot_finder.pl> output can be given directly via C<STDIN> or as a
29 file. By default a tab-delimited binary presence/absence matrix for
30 query hits per subject organism will be printed to C<STDOUT>. Use
31 option B<-t> to count all query hits per subject organism, not just
32 the binary presence/absence. You can transpose the presence/absence
33 binary matrix with the script C<transpose_matrix.pl> (see its help
34 with B<-h>).
35
36 The resulting matrix can be used to associate the presence/absence
37 data with a phylogenetic tree, e.g. use the Interactive Tree Of Life
38 website (L<B<iTOL>|http://itol.embl.de/>). B<iTOL> likes individual
39 comma-separated input files, thus use options B<-s -c> for this
40 purpose.
41
42 For B<iTOL> the organism names have to have identical names to the
43 leaves of the phylogenetic tree, thus manual adaptation, e.g. in a
44 spreadsheet software, might be needed. B<Careful>, subject organisms
45 without a significant B<BLASTP> hit won't be included in the
46 tab-separated C<prot_finder.pl> result table and hence can't be
47 included by C<prot_binary_matrix.pl>. If needed add them manually to
48 the result matri(x|ces).
49
50 Additionally, you can give the presence/absence binary matrix to
51 C<binary_group_stats.pl> to calculate presence/absence statistics
52 for groups of columns and not simply single columns of the matrix.
53 C<binary_group_stats.pl> also has a comprehensive manual with its
54 option B<-h>.
55
56 =head1 OPTIONS
57
58 =over 20
59
60 =item B<-h>, B<-help>
61
62 Help (perldoc POD)
63
64 =item B<-s>, B<-separate>
65
66 Separate presence/absence files for each query protein printed to
67 the result directory [default without B<-s> = C<STDOUT> matrix for
68 all query proteins combined]
69
70 =item B<-d>=I<str>, B<-dir_result>=I<str>
71
72 Path to result folder, requires option B<-s> [default =
73 './binary_matrix_results']
74
75 =item B<-t>, B<-total>
76
77 Count total occurrences of query proteins, not just binary
78 presence/absence
79
80 =item B<-c>, B<-csv>
81
82 Output matri(x|ces) in comma-separated format (*.csv) instead of
83 tab-delimited format (*.tsv)
84
85 =item B<-l>, B<-locus_tag>
86
87 Use the locus_tag B<prefixes> in the subject_ID column of the
88 C<prot_finder.pl> output (instead of the subject_organism columns) as
89 organism IDs to associate query hits to organisms. The subject_ID
90 column will include locus_tags if they're annotated for a genome
91 (see the L<C<cds_extractor.pl>|/cds_extractor> format description).
92 Useful if the L<C<cds_extractor.pl>|/cds_extractor> output doesn't
93 include strain names for 'o=' in the FASTA IDs, because the prefix
94 of a locus_tag should be unique for a genome (see
95 L<http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>).
96
97 =item B<-v>, B<-version>
98
99 Print version number to C<STDERR>
100
101 =back
102
103 =head1 OUTPUT
104
105 =over 17
106
107 =item C<STDOUT>
108
109 The resulting presence/absence matrix is printed to C<STDOUT>
110 without option B<-s>. Redirect or pipe into another tool as needed.
111
112 =item (F<./binary_matrix_results>)
113
114 Separate query presence/absence files are stored in a result folder
115 with option B<-s>
116
117 =item (F<./binary_matrix_results/query-ID_binary_matrix.(tsv|csv)>)
118
119 Separate query presence/absence files with option B<-s>
120
121 =back
122
123 =head1 EXAMPLES
124
125 =over
126
127 =item C<perl prot_binary_matrix.pl -s -d result_dir -t blast_hits.tsv>
128
129 =back
130
131 B<or>
132
133 =over
134
135 =item C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl -l -c E<gt> binary_matrix.csv>
136
137 =back
138
139 B<or>
140
141 =over
142
143 =item C<mkdir result_dir && ./prot_finder_pipe.sh -q query.faa -s subject.faa -d result_dir -m | tee result_dir/blast_hits.tsv | perl prot_binary_matrix.pl E<gt> binary_matrix.tsv>
144
145 =back
146
147 =head1 VERSION
148
149 0.6 update: 23-11-2015
150 0.1 25-10-2012
151
152 =head1 AUTHOR
153
154 Andreas Leimbach aleimba[at]gmx[dot]de
155
156 =head1 LICENSE
157
158 This program is free software: you can redistribute it and/or modify
159 it under the terms of the GNU General Public License as published by
160 the Free Software Foundation; either version 3 (GPLv3) of the
161 License, or (at your option) any later version.
162
163 This program is distributed in the hope that it will be useful, but
164 WITHOUT ANY WARRANTY; without even the implied warranty of
165 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
166 General Public License for more details.
167
168 You should have received a copy of the GNU General Public License
169 along with this program. If not, see L<http://www.gnu.org/licenses/>.
170
171 =cut
172
173
174 ########
175 # MAIN #
176 ########
177
178 use strict;
179 use warnings;
180 use autodie;
181 use Getopt::Long;
182 use Pod::Usage;
183
184 ### Get options with Getopt::Long
185 my $Opt_Separate; # separate presence/absence files for each query printed to result_dir (default: single presence/absence file for all queries printed to STDOUT)
186 my $Result_Dir; # path to result folder, requires option '-s'; default set below to 'binary_matrix_results'
187 my $Opt_Total; # count total occurrences of query proteins not just presence/absence binary
188 my $Opt_Csv; # output in csv format (default: tsv)
189 my $Opt_Locus_Tag; # use locus_tag prefixes (from subject_ID column, see cds_exractor) instead of subject_organism as ID to count query hits
190 my $VERSION = 0.6;
191 my ($Opt_Version, $Opt_Help);
192 GetOptions ('separate' => \$Opt_Separate,
193 'dir_result=s' => \$Result_Dir,
194 'total' => \$Opt_Total,
195 'csv' => \$Opt_Csv,
196 'locus_tag' => \$Opt_Locus_Tag,
197 'version' => \$Opt_Version,
198 'help|?' => \$Opt_Help)
199 or pod2usage(-verbose => 1, -exitval => 2);
200
201
202
203 ### Run perldoc on POD and set option defaults
204 pod2usage(-verbose => 2) if ($Opt_Help);
205 die "$0 $VERSION\n" if ($Opt_Version);
206 if ($Result_Dir && !$Opt_Separate) {
207 warn "### Warning: Option '-d' given but not its required option '-s', forcing option '-s'!\n";
208 $Opt_Separate = 1;
209 }
210
211 my $Separator = "\t";
212 $Separator = "," if ($Opt_Csv); # optional csv output format
213
214
215
216 ### Check input
217 if (-t STDIN && ! @ARGV) {
218 my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
219 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
220 } elsif (!-t STDIN && @ARGV) {
221 my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
222 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
223 }
224 die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n" if (@ARGV > 1);
225 die "\n### Fatal error: File '$ARGV[0]' does not exist!\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
226
227
228
229 ### Create result folder, only for option '-s'
230 if ($Opt_Separate) {
231 $Result_Dir = 'binary_matrix_results' if (!$Result_Dir);
232 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
233 if (-e $Result_Dir) {
234 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
235 } else {
236 mkdir $Result_Dir;
237 }
238 }
239
240
241
242 ### Parse the input from 'prot_finder.pl' to associate organism with query hit
243 my @Queries; # store all query proteins
244 my %Hits; # hash of hash to associate subject_organism/subject_ID with query hit
245
246 while (<>) { # read STDIN or file input
247 chomp;
248 if ($. == 1) { # $. = check only first line of input (works with STDIN and file input)
249 die "\n### Fatal error: Input doesn't have the correct format, it has to be the output of 'prot_finder.pl' with the following header:\n# subject_organism\tsubject_ID\tsubject_gene\tsubject_protein_desc\tquery_ID\tquery_desc\tquery_coverage [%]\tquery_identities [%]\tsubject/hit_coverage [%]\te-value of best HSP\tbit-score of best HSP\n" if (!/# subject_organism\tsubject_ID\tsubject_gene\tsubject_protein_desc\tquery_ID\tquery_desc/);
250 next; # skip header line
251 }
252
253 my @line = split (/\t/, $_); # $line[0] = subject_organism; $line[1] = subject_ID (mostly locus_tag, see cds_extractor); $line[4] = query_ID
254 my $query = $line[4];
255 my $id;
256 if ($Opt_Locus_Tag) { # use subject_ID
257 die "\n### Fatal error: The subject_ID of the following line doesn't look like an NCBI locus tag (see: http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation). Column subject_ID needs to include only locus_tags for option '-l'!\n$_\n" if ($line[1] !~ /^([a-zA-Z][0-9a-zA-Z]{2,11})_[0-9a-zA-Z]+$/); # check if subject_ID is locus_tag ('\w' not used because allows alphanumeric and '_')
258 # excerpt: The locus_tag prefix must be 3-12 alphanumeric characters and the first character may not be a digit. All chromosomes and plasmids of an individual genome must use the exactly same locus_tag prefix followed by an underscore and then an alphanumeric identification number that is unique within the given genome. Other than the single underscore used to separate the prefix from the identification number no other special characters can be used in the locus_tag.
259 $id = $1; # locus_tag prefix, unique for each genome
260 } else { # use subject_organism as ID
261 $id = $line[0];
262 }
263
264 if ($Opt_Total) { # count total occurrences of query proteins
265 $Hits{$id}{$query}++;
266
267 } else { # only binary output (0 or 1)
268 $Hits{$id}{$query} = 1;
269 }
270
271 push (@Queries, $query) if (!grep($_ eq $query, @Queries)); # push each query only once in @Queries
272 }
273
274
275
276 ### Print binary data to a joined or separate (for each query; as needed by iTOL) file(s)
277 if (!$Opt_Separate) { # joined output
278 # print header
279 if ($Opt_Locus_Tag) {
280 print "locus_tag";
281 } else {
282 print "organism";
283 }
284 print "$Separator";
285 print join("$Separator", sort @Queries), "\n";
286
287 # print data to STDOUT
288 foreach my $id (sort keys %Hits) {
289 print "$id";
290 foreach my $query (sort @Queries) {
291 if ($Hits{$id}->{$query}) {
292 print "$Separator", "$Hits{$id}->{$query}";
293 } else {
294 print "$Separator", "0";
295 }
296 }
297 print "\n";
298 }
299
300 } elsif ($Opt_Separate) { # separated output for each query
301 foreach my $query (sort @Queries) {
302 my $file = "$Result_Dir/$query\_binary\_matrix.";
303 if ($Opt_Csv) {
304 $file .= "csv";
305 } else {
306 $file .= "tsv";
307 }
308 open (my $binary_matrix_fh, ">", "$file");
309 foreach my $id (sort keys %Hits) {
310 print $binary_matrix_fh "$id";
311 if ($Hits{$id}->{$query}) {
312 print $binary_matrix_fh "$Separator", "$Hits{$id}->{$query}\n";
313 } else {
314 print $binary_matrix_fh "$Separator", "0\n";
315 }
316 }
317 close $binary_matrix_fh;
318 }
319 }
320
321 exit;
322
323
324 ###############
325 # Subroutines #
326 ###############
327
328 ### Subroutine to empty a directory with user interaction
329 sub empty_dir {
330 my $dir = shift;
331 print STDERR "\nDirectory '$dir' already exists! You can use either option '-d' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
332 my $user_ask = <STDIN>;
333 if ($user_ask =~ /y/i) {
334 unlink glob "$dir/*"; # remove all files in results directory
335 } else {
336 die "\nScript abborted!\n";
337 }
338 return 1;
339 }