comparison nml_tree_relabeler.pl @ 0:02bc0d7d40b5 draft

planemo upload for repository https://github.com/phac-nml/galaxy_tools/blob/master/tools/tree_relabeler commit 974af0d5e7ccfcbd47284eb85a5f593d5e48daf8
author nml
date Mon, 29 Apr 2019 11:18:32 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:02bc0d7d40b5
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use Bio::TreeIO;
6 use Bio::Tree::Tree;
7 use IO::String;
8 use Getopt::Long;
9 use Pod::Usage;
10
# Command-line state. These file-scoped lexicals are read by the subs below.
my ($treefile, $tabfile, $delim, $outfile, $template, $help, $replace, $tabline, $man );

# Parse the command line. GetOptions returns false when it meets an unknown or
# malformed option (it has already printed a diagnostic by then), so show the
# usage and stop rather than continue with a half-parsed configuration.
GetOptions(
    'i|treefile=s'       => \$treefile,
    't|tabfile=s'        => \$tabfile,
    'o|outfile=s'        => \$outfile,
    'd|delim:s'          => \$delim,
    'p|print-template=s' => \$template,
    'r|replace'          => \$replace,
    'h|help'             => \$help,
    'm|man'              => \$man
) or pod2usage(-exitval => 2,
               -verbose => 1,
               -output  => \*STDERR);

# Dispatch: help/man first, then argument validation, then one of the two
# operating modes (template generation or tree relabeling).
if ($help){
    pod2usage(-verbose  => 99,
              -sections => "SYNOPSIS|OPTIONS AND ARGUMENTS|DESCRIPTION|DIAGNOSTICS");
} elsif ($man){
    pod2usage(-verbose => 2);
} elsif (!$treefile) {
    pod2usage(-msg     => "**Tree file is required.**\n",
              -exitval => 2,
              -verbose => 1,
              -output  => \*STDERR);
} elsif ( !(($template) || ($tabfile && $outfile)) ){
    pod2usage(-msg     => "**Either select a template file, or a tab file and outfile**\n",
              -exitval => 2,
              -verbose => 1,
              -output  => \*STDERR);
} elsif ( $treefile && $template) {
    # A template request takes precedence over relabeling when both are given.
    generate_template();
} elsif ( $treefile && $tabfile && $outfile ){
    relabel_tree();
}
44
# Template mode: validate the tree file first, then write the tab-file
# template derived from it.
sub generate_template {
    check_treefile();
    return print_template();
}
50
# Relabel mode: normalise the delimiter, read the label map from the tab
# file, validate the tree file, and finally write the relabeled tree.
sub relabel_tree {
    check_delimiter();

    # load_tabfile hands back a reference to the old-label => new-label map.
    my $labels_ref = load_tabfile();

    check_treefile();
    return write_treefile($labels_ref);
}
59
# Write a tab-file template to the file named by $template: a "#label"
# header line followed by the id of every leaf node of each tree read from
# $treefile, one id per line.
#
# Exits with status 2 if a parsed tree has one node or fewer.
# Dies if the template file cannot be opened or cannot be closed cleanly.
sub print_template
{
    open my $tempout, '>', $template or die "Could not open file: $!";
    print {$tempout} "#label\n";

    my $treein = Bio::TreeIO->new(
        -format => "newick",
        -file   => "$treefile"
    );

    while (my $t = $treein->next_tree) {
        my @nodes = $t->get_nodes;

        if (scalar(@nodes) <= 1) {
            print STDERR "Tree is not in newick format.\n";
            exit(2);
        }

        foreach my $node ($t->get_leaf_nodes) {
            # A leaf without an id still gets its (empty) row, but without
            # triggering an "uninitialized value" warning.
            my $id = $node->id;
            $id = '' unless defined $id;
            print {$tempout} $id, "\n";
        }
    }

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($tempout) or die "Could not close file: $!";
    return;
}
89
# Normalise and validate the file-scoped $delim used to join label columns.
#
# An absent or empty delimiter defaults to a single space. A delimiter that
# contains a Newick reserved character is rejected with exit status 1.
sub check_delimiter
{
    # Test definedness and emptiness explicitly: a plain truthiness test
    # would also discard the legitimate one-character delimiter "0".
    if (!defined $delim || $delim eq '') {
        $delim = ' ';
        return;
    }

    if ($delim =~ /[\(\)\;\,\:]/) {
        print STDERR "Delimiters cannot be Newick reserved characters '(),;:'.\n";
        exit(1);
    }
    return;
}
105
# Read the tab-delimited file named by $tabfile and build the label map.
#
# Returns a reference to a hash keyed by the original tip label (first
# column). Each value is the new label: all columns joined with $delim, or,
# when $replace is set, every column except the first.
#
# Lines starting with '#' and lines with no non-whitespace content are
# skipped. Exits with status 1 if the file cannot be opened, if a line
# contains a Newick reserved character, or if a data line has fewer than two
# columns.
sub load_tabfile
{
    my %new_labels;

    # A failed open covers both a missing file and an unreadable one; without
    # this check an unreadable file would silently produce an empty map.
    my $tabin;
    if (!open($tabin, '<', $tabfile)) {
        print STDERR "Error opening tab file.\n";
        exit(1);
    }

    while (my $tabline = <$tabin>) {
        # Header/comment rows start with '#'.
        next if $tabline =~ /^#/;

        # Tolerate Windows line endings.
        $tabline =~ s/\r//g;
        chomp $tabline;

        # A blank line (typically a trailing one) carries no label.
        next if $tabline !~ /\S/;

        if ($tabline =~ /[\(\)\;\,\:]/) {
            print STDERR "New labels cannot contain Newick reserved characters '(),;:'.\n";
            exit(1);
        }

        my @splits = split("\t", $tabline);

        # One column or less means there is nothing new to put on the tip.
        if (scalar(@splits) <= 1) {
            print STDERR "Tab file does not contain new labels.\n";
            exit(1);
        }

        # The first column is the original tip label and keys the map.
        my $label = $splits[0];

        # In replace mode the original label is not part of the new label.
        shift @splits if ($replace);

        $new_labels{$label} = join($delim, @splits);
    }

    close($tabin);

    return \%new_labels;
}
155
# Sanity-check the file named by $treefile before it is handed to the parser.
#
# Exits with status 1 if the file is missing, unreadable or empty, and with
# status 2 if it has more than one line, does not end in ';' (ignoring
# trailing whitespace and line terminators), or has unbalanced parentheses.
sub check_treefile
{
    if (!(-e $treefile)) {
        print STDERR "Error opening tree file.\n";
        exit(1);
    }
    if (-z $treefile) {
        print STDERR "Tree file is empty.\n";
        exit(1);
    }

    my $treein;
    if (!open($treein, '<', $treefile)) {
        print STDERR "Error opening tree file.\n";
        exit(1);
    }
    my $line     = <$treein>;
    my $nextline = <$treein>;
    close($treein);

    # Nothing could be read even though the file is not zero-sized
    # (for example, the path is not a regular file).
    if (!defined $line) {
        print STDERR "Error opening tree file.\n";
        exit(1);
    }

    if (defined $nextline) {
        print STDERR "Tree is not in newick format. More than one line\n";
        exit(2);
    }

    # Strip trailing whitespace and any line terminator before looking at the
    # last character, so a file with no final newline or with CRLF endings is
    # judged on its content rather than on a fixed offset from the end.
    (my $trimmed = $line) =~ s/\s+\z//;
    if ($trimmed !~ /;\z/) {
        # newick formats end in ;
        print STDERR "Tree is not in newick format. Does not end in ; \n";
        exit(2);
    }

    # Parentheses must nest properly: never close more than were opened, and
    # finish with none left open.
    my $bracketcount = 0;
    foreach my $char (split("", $line)) {
        if ($char eq ")") {
            if ($bracketcount == 0) {
                # There is a ) before a (
                print STDERR "Tree is not in newick format. Missing a (\n";
                exit(2);
            }
            $bracketcount--;
        }
        elsif ($char eq "(") {
            $bracketcount++;
        }
    }
    if ($bracketcount != 0) {
        # There were not equal number of ( and )
        print STDERR "Tree is not in newick format. Brackets do not match. \n";
        exit(2);
    }
    return;
}
217
# Apply the label map to every tree read from $treefile and write the result
# to $outfile in newick format.
#
# Takes a reference to a hash mapping an existing node id to its new label.
# For each label a line is printed to STDOUT saying whether it was found and
# changed or not found. Exits with status 2 if a parsed tree has one node or
# fewer.
sub write_treefile
{
    my $temp = shift;
    my %new_labels = %{$temp};

    # Read as newick explicitly, matching print_template, instead of leaving
    # the input format to be guessed.
    my $treeinput = Bio::TreeIO->new(
        -format => "newick",
        -file   => "$treefile"
    );

    my $treeoutput = Bio::TreeIO->new(
        -format => "newick",
        -file   => ">$outfile"
    );

    # Sorted so the per-label report comes out in a stable order.
    my @labels = sort keys %new_labels;

    while (my $t = $treeinput->next_tree) {
        if (scalar($t->get_nodes) <= 1) {
            # if tree has only 1 or less nodes, then it's not in correct format
            print STDERR "Tree is not in newick format.\n";
            exit(2);
        }

        # Resolve every label against the ORIGINAL ids before renaming
        # anything. Renaming during the lookups would let a mapping such as
        # A->B, B->C match a tip that has already been renamed.
        my %node_for;
        foreach my $label (@labels) {
            my $tip = $t->find_node(-id => $label);
            $node_for{$label} = $tip if $tip;
        }

        foreach my $label (@labels) {
            my $tip = $node_for{$label};
            if ($tip) {
                $tip->id($new_labels{$label});
                print $label, " found and changed.\n";
            }
            else {
                # if label from tab file is not found, notify the user
                print $label, " not found.\n";
            }
        }

        $treeoutput->write_tree($t);
    }
    return;
}
256
# End of the executable part of the script; what follows is POD only.
exit 0;
258
259 =head1 NAME
260
261 nml_tree_relabeler.pl - Changes the tip labels on a newick formatted tree
262
263 =head1 VERSION
264
265 This documentation refers to nml_tree_relabeler.pl version 0.0.2.
266
267 =head1 SYNOPSIS
268
269 nml_tree_relabeler.pl -i treefile [-t tabfile -o outfile (-d delim) (-r) | -p template]
270
271 =head1 OPTIONS AND ARGUMENTS
272
273 =over
274
275 =item B<-i>, B<--treefile>
276
277 The name of the tree file containing the tree whose tip labels will be adjusted. Only accepts trees in newick format. (required)
278
279 =item B<-t>, B<--tabfile>
280
281 The name of the tab delimited file containing current tip labels and the info to be replaced/added to the labels. The first column must contain the current tree labels. Must not contain any of the Newick reserved characters '(),:;' (required option)
282
283 =item B<-o>, B<--outfile>
284
285 The output file. (required option)
286
287 =item B<-d>, B<--delim>
288
289 The character to use to divide the information of the labels. Must not be one of the Newick reserved characters '(),:;' (optional)
290
291 =item B<-r>, B<--replace>
292
293 Replace the tip names. This option will replace the tree tip names with the specified labels, instead of adding them to the tip name.
294
295 =item B<-p>, B<--print-template>
296
297 The name of the output template file. Prints out a template for the tabfile.(required option)
298
299 =item B<-h>, B<--help>
300
301 To display help message
302
303 =item B<-m>, B<--man>
304
305 To display manual
306
307 =back
308
309 =head1 DESCRIPTION
310
311 =over
312
313 nml_tree_relabeler takes a newick format tree file to modify tip labels and a tab-delimited file containing current tip labels and additional information to add to the tips in 2 or more columns. Header row of the tab delimited file must start with a '#'. An example is below:
314
315 #label outbreak year location
316 orgs1 outbreak1 year1 location1
317 orgs2 outbreak2 year2 location2
318
319 and so on.
320
321 The information in the tab file is inserted into the tree file so the new information will appear on the tip labels.
322
323 Alternatively, nml_tree_relabeler can print out the tip names to a tab-delimited template file.
324
325 =back
326
327 =head1 DIAGNOSTICS
328
329 =over
330
331 =item B<Tree file, tab file, and output file are required>
332
333 Use the proper command line arguments (-i, -t, -o respectively) to add the filenames of the tree file, tab file, and output file.
334
335 =item B<Tree file is required>
336
337 Use the -i command line argument to add the tree file.
338
339 =item B<Either select a template file, or a tab file and outfile>
340
341 Use the proper command line arguments to either add a template file (-p) to print a tab template, or to add a tab file and an output file (-t, -o respectively) to relabel a tree.
342
343 =item B<Label not found>
344
345 A warning that a label provided in the tab file was not found in the tree file. Relabeling continues.
346
347 =item B<Error opening tab/tree file>
348
349 An error occurred while opening the tab/tree file, please check path/file.
350
351 =item B<Tree is not in newick format>
352
353 The tree file does not appear to be in newick format. Please check file and convert if necessary.
354
355 =item B<Tab file does not contain new labels>
356
357 The tab file only contains one column and therefore does not have any additional information to add to the tree. Please check the tab file.
358
359 =item B<Delimiter/tabfile cannot contain Newick reserved characters '(),;:' >
360
361 The tab file or delimiter selected contains one of the characters used in the Newick format. This will cause an error when trying to read the tree. Please modify your tab file or select a new delimiter.
362
363 =back
364
365 =head1 CONFIGURATION AND ENVIRONMENT
366
367 =head1 DEPENDENCIES
368
369 =over
370
371 =item use Bio::TreeIO
372
373 =item use Bio::Tree::Tree
374
375 =back
376
377 =head1 INCOMPATIBILITIES
378
379 This script only works for NEWICK formatted trees. All other tree formats are not compatible.
380
381 =head1 AUTHOR
382
383 Jen Cabral, <jencabral@gmail.com>
384
385 =head1 BUGS AND LIMITATIONS
386
387 There are no known bugs in this module.
388
389 Please report problems to Jen Cabral, <jencabral@gmail.com>
390
391 =head1 COPYRIGHT & LICENSE
392
393 Copyright (C) 2015 by NML
394
395 This program is free software: you can redistribute it and/or modify
396 it under the terms of the GNU General Public License as published by
397 the Free Software Foundation, either version 3 of the License, or
398 (at your option) any later version.
399
400 This program is distributed in the hope that it will be useful,
401 but WITHOUT ANY WARRANTY; without even the implied warranty of
402 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
403 GNU General Public License for more details.
404
405 You should have received a copy of the GNU General Public License
406 along with this program. If not, see <http://www.gnu.org/licenses/>
407
408 =cut