Mercurial > repos > nml > tree_relabeler
changeset 0:02bc0d7d40b5 draft
planemo upload for repository https://github.com/phac-nml/galaxy_tools/blob/master/tools/tree_relabeler commit 974af0d5e7ccfcbd47284eb85a5f593d5e48daf8
author | nml |
---|---|
date | Mon, 29 Apr 2019 11:18:32 -0400 |
parents | |
children | 267858c28a34 |
files | nml_tree_relabeler.pl test-data/phylogeneticTree.newick.nhx test-data/results test-data/tabs.txt tree_relabeler.xml |
diffstat | 5 files changed, 479 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nml_tree_relabeler.pl Mon Apr 29 11:18:32 2019 -0400 @@ -0,0 +1,408 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Bio::TreeIO; +use Bio::Tree::Tree; +use IO::String; +use Getopt::Long; +use Pod::Usage; + +my ($treefile, $tabfile, $delim, $outfile, $template, $help, $replace, $tabline, $man ); + +GetOptions( + 'i|treefile=s' => \$treefile, + 't|tabfile=s' => \$tabfile, + 'o|outfile=s' => \$outfile, + 'd|delim:s' => \$delim, + 'p|print-template=s' => \$template, + 'r|replace' => \$replace, + 'h|help' => \$help, + 'm|man' => \$man +); + +if ($help){ + pod2usage(-verbose => 99, + -sections => "SYNOPSIS|OPTIONS AND ARGUMENTS|DESCRIPTION|DIAGNOSTICS"); +} elsif ($man){ + pod2usage(-verbose => 2); +} elsif (!$treefile) { + pod2usage(-msg => "**Tree file is required.**\n", + -exitval => 2, + -verbose => 1, + -output => \*STDERR); +} elsif ( !(($template) || ($tabfile && $outfile)) ){ + pod2usage(-msg => "**Either select a template file, or a tab file and outfile**\n", + -exitval => 2, + -verbose => 1, + -output => \*STDERR); +} elsif ( $treefile && $template) { + generate_template(); +} elsif ( $treefile && $tabfile && $outfile ){ + relabel_tree(); +} + +sub generate_template +{ + check_treefile(); + print_template(); +} + +sub relabel_tree +{ + my %new_labels; + check_delimiter(); + %new_labels = %{load_tabfile()}; + check_treefile(); + write_treefile(\%new_labels); +} + +sub print_template +{ + open my $tempout, '>', $template or die "Could not open file: $!"; + print $tempout "#label\n"; + + my $treein = Bio::TreeIO->new( + -format => "newick", + -file => "$treefile" + ); + + while(my $t = $treein->next_tree) + { + my @nodes = $t->get_nodes; + + if (scalar(@nodes) <= 1) + { + print STDERR "Tree is not in newick format.\n"; + exit(2); + } + else + { + foreach my $node ($t->get_leaf_nodes) + { + print $tempout $node->id,"\n"; + } + } + } + close ($tempout); +} + +sub check_delimiter +{ + if (!($delim)){ + $delim = ' '; + } else{ + my $delimlen = length $delim; + # delim length less than 1 indicates empty string. + if ($delimlen < 1){ + $delim = ' '; + }elsif ($delim =~ /[\(\)\;\,\:]/){ + print STDERR "Delimiters cannot be Newick reserved characters '(),;:'.\n"; + exit(1); + } + } +} + +sub load_tabfile +{ + my %new_labels; + if (!(-e $tabfile)){ + # exit if error in tab file + print STDERR "Error opening tab file.\n"; + exit(1); + } + + open (my $tabin, '<', $tabfile); + + # append the > to the front of the outfile string + # $outfile = '>'.$outfile; + + # go through tab file to add new labels to a hash file + while ($tabline = <$tabin>){ + # skip the first row if it starts with a # + next if $tabline =~ s/^#//; + $tabline =~ s/\r//g; + chomp $tabline; + + if ($tabline =~ /[\(\)\;\,\:]/){ + print STDERR "New labels cannot contain Newick reserved characters '(),;:'.\n"; + exit(1); + } + + my @splits = split("\t", $tabline); + + # Check that the tab file has more than one column + my $num_cols = scalar @splits; + if ($num_cols <= 1){ + # exit if one column or less; no new info to add to tree/error with tab layout. + print STDERR "Tab file does not contain new labels.\n"; + exit(1); + } + # set the hash label to the first value in a row, is the original tip label. + my $label = $splits[0]; + # If user chose find and replace instead, get rid of the first value. + shift @splits if ($replace); + # join all values from @split into one string with delim separating them + my $new_info = join($delim, @splits); + # add the new info to the hash + $new_labels{$label} = $new_info; + } + + close ($tabin); + + return \%new_labels; +} + +sub check_treefile +{ + if (!(-e $treefile)){ + # exit if error in tree file + print STDERR "Error opening tree file.\n"; + exit(1); + } + + + # open tree file to check format + if (!(-e $treefile)){ + # exit if error in tab file + print STDERR "Error opening tree file.\n"; + exit(1); + } + if (-z $treefile){ + print STDERR "Tree file is empty.\n"; + exit(1); + } + my $linecount = 0; + open (my $treein, '<', $treefile); + my $line = <$treein>; + my $nextline = <$treein>; + if (defined $nextline){ + print STDERR "Tree is not in newick format. More than one line\n"; + exit(2); + } + close ($treein); + + my @chars = split("",$line); + my $lastelem = @chars-2; + my $bracketcount = 0; + # look for non-spaces at end of line + while($chars[$lastelem] eq " "){ + $lastelem--; + } + if ($chars[$lastelem] ne ";"){ + # newick formats end in ; + print STDERR "Tree is not in newick format. Does not end in ; \n"; + exit(2); + } + + foreach my $char (@chars){ + if ($char eq ")"){ + if ($bracketcount == 0){ + # There is a ) before a ( + print STDERR "Tree is not in newick format. Missing a (\n"; + exit(2); + }else { + $bracketcount--; + } + }elsif ($char eq "("){ + $bracketcount++; + } + } + if ($bracketcount != 0){ + # There were not equal number of ( and ) + print STDERR "Tree is not in newick format. Brackets do not match. \n"; + exit(2); + } +} + +sub write_treefile +{ + my $temp = shift; + my %new_labels = %{$temp}; + + # open tree file as a tree + my $treeinput = Bio::TreeIO->new( + -file => "$treefile" + ); + + #create output tree file + my $treeoutput = Bio::TreeIO->new( + -format => "newick", + -file => ">$outfile" + ); + + while(my $t = $treeinput->next_tree ){ + # check if tree is valid + if (scalar($t->get_nodes)<=1){ + # if tree has only 1 or less nodes, then it's not in correct format + print STDERR "Tree is not in newick format.\n"; + exit(2); + } else{ + foreach my $label(keys %new_labels){ + my $tip = $t->find_node(-id =>$label); + if ($tip){ + # if found, change to the new label + $tip->id($new_labels{$label}); + print $label," found and changed.\n" + } else{ + # if label from tab file is not found, notify the user + print $label," not found.\n"; + } + } + $treeoutput->write_tree($t); + } + } +} + +exit; + +=head1 NAME + +nml_tree_relabeler.pl - Changes the tip labels on a newick formatted tree + +=head1 VERSION + +This documentation refers to nml_tree_relabeler.pl version 0.0.2. + +=head1 SYNOPSIS + + nml_tree_relabeler.pl -i treefile [-t tabfile -o outfile (-d delim) (-r) | -p template] + +=head1 OPTIONS AND ARGUMENTS + +=over + +=item B<-i>, B<--treefile> + +The name of the tree file containing the tree to adjust the tip laels. Only accepts trees in newick format. (required) + +=item B<-t>, B<--tabfile> + +The name of the tab delimited file containing current tip labels and the info to be replaced/added tothe labels. The first column must contain the current tree labels. Must not contain one of the Newick reserved characters '(),:;' (required option) + +=item B<-o>, B<--out> + +The output file. (required option) + +=item B<-d>, B<--delim> + +The character to use to divide the information of the labels. Must not be one of the Newick reserved characters '(),:;' (optional) + +=item B<-r>, B<--replace> + +Replace the tip names. This option will replace the tree tip names with the specified labels, instead of adding them to the tip name. + +=item B<-p>, B<--print-template> + +The name of the output template file. Prints out a template for the tabfile.(required option) + +=item B<-h>, B<--help> + +To display help message + +=item B<-m>, B<--man> + +To display manual + +=back + +=head1 DESCRIPTION + +=over + +nml_tree_relabeler takes a newick format tree file to modify tip labels and a tab-delimited file containing current tip labels and additional information to add to the tips in 2 or more columns. Header row of the tab delimited file must start with a '#'. An example is below: + + #label outbreak year location + orgs1 outbreak1 year1 location1 + orgs2 outbreak2 year2 location2 + +and so on. + +The information in the tab file is inserted into the tree file so the new information will appear on the tip labels. + +Alternatively, nml_tree_relabeler can print out the tip names to a tab-delimited template file. + +=back + +=head1 DIAGNOSTICS + +=over + +=item B<Tree file, tab file, and output file are required> + +Use the proper command line arguments (-i, -t, -o respectively) to add the filenames of the tree file, tab file, and output file. + +=item B<Tree file is required> + +Use the -i command line argument to add the tree file. + +=itemB<Either select a template file, or a tab file and outfile> + +Use the proper command line arguments to either add a template file (-p) to print a tab template, or to add a tab file and an output file (-t, -o respectively) to relabel a tree. + +=item B<Label not found> + +A warning that a label provided in the tab file was not found in the tree file. Relabeling continues. + +=item B<Error opening tab/tree file> + +An error occured while opening the tab/tree file, please check path/file. + +=item B<Tree is not in newick format> + +The tree file does not appear to be in newick format. Please check file and convert if necessary. + +=item B<Tab file does not contain new labels> + +The tab file only contains one column and therefore does not have any additional information to add to the tree. Please check the tab file. + +=item B<Delimiter/tabfile cannot contain Newick reserved characters '(),;:' > + +The tab file or delimiter selected contains one of the characters used in the Newick format. This will cause an error when trying to read the tree. Please modify your tab file or select a new delimiter. + +=back + +=head1 CONFIGURATION AND ENVIRONMENT + +=head1 DEPENDENCIES + +=over + +=item use Bio::TreeIO + +=item use Bio::Tree::Tree + +=back + +=head1 INCOMPATIBILITIES + +This script only works for NEWICK formatted trees. All other tree formats are not compatible. + +=head1 AUTHOR + +Jen Cabral, <jencabral@gmail.com> + +=head1 BUGS AND LIMITATIONS + +There are no known bugs in this module. + +Please report problems to Jen Cabral, <jencabral@gmail.com> + +=head1 COPYRIGHT & LICENSE + +Copyright (C) 2015 by NML + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/> + +=cut \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/phylogeneticTree.newick.nhx Mon Apr 29 11:18:32 2019 -0400 @@ -0,0 +1,1 @@ +(reference:0.29121884);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/results Mon Apr 29 11:18:32 2019 -0400 @@ -0,0 +1,1 @@ +(reference outbreak1 year1 location1:0.29121884); \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tabs.txt Mon Apr 29 11:18:32 2019 -0400 @@ -0,0 +1,1 @@ +reference outbreak1 year1 location1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tree_relabeler.xml Mon Apr 29 11:18:32 2019 -0400 @@ -0,0 +1,68 @@ +<tool id="tree_relabel" name="Tree Relabeler" version="1.0.0"> + <description>Relabels the tips of a newick formatted tree.</description> + + <requirements> + <requirement type="package" version="1.7.2">perl-bioperl</requirement> + <requirement type="package" version="2.49">perl-getopt-long</requirement> + </requirements> + + <stdio> + <exit_code range="1" level="fatal" description="Error opening a file."/> + <exit_code range="2" level="fatal" description="Error with tree format."/> + </stdio> + + <command interpreter="perl"> + nml_tree_relabeler.pl -i $treefile -t $tabfile -o $output -d $delim + </command> + + <inputs> + <param name="treefile" type="data" format="txt" label="The newick formated tree file to be relabeled:" optional="false"/> + <param name="tabfile" type="data" format="txt" label="The tab separated file containing the current labels and the info to be added to the labels" optional="false"/> + <param name="delim" type="text" label="Delimiter for new tip labels (space, _, etc)" optional="true" help="If left blank, labels of updated tips will be separated by spaces."/> + </inputs> + + <outputs> + <data format="txt" name="output"/> + </outputs> + + <tests> + <test> + <param name="treefile" value="phylogeneticTree.newick.nhx"/> + <param name="tabfile" value="tabs.txt"/> + <output name="output" file="results"/> + </test> + </tests> + <help> + +What it does +============ + +This tool provides a means for relabeling the tips of a newick formatted tree. + +It takes a newick format tree file to modify tip labels and a tab-delimited file containing current tip labels and additional information to add to the tips in 2 or more columns. + +Usage +===== + +Header row of the tab delimited file must start with a '#'. An example is below: + +- #label outbreak year location +- orgs1 outbreak1 year1 location1 +- orgs2 outbreak2 year2 location2 + +and so on. + +The information in the tab file is inserted into the tree file so the new information will appear on the tip labels. + + +Input +===== + +Newick format tree + +Tab delimted file + +Desired delimiter for updated tip labels + + </help> +</tool>