Mercurial > repos > earlhaminst > ete
diff ete_homology_classifier.py @ 5:817031b8486d draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
author | earlhaminst |
---|---|
date | Thu, 10 May 2018 06:15:17 -0400 |
parents | |
children | f1eca1158f21 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ete_homology_classifier.py Thu May 10 06:15:17 2018 -0400 @@ -0,0 +1,78 @@ +from __future__ import print_function + +import optparse + +from ete3 import PhyloTree + + +def main(): + usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]" + parser = optparse.OptionParser(usage=usage) + parser.add_option('--genetree', help='GeneTree in nhx format') + parser.add_option('--out_format', type='string', default='tabular', help='Choose output format') + parser.add_option('--filters', default='', help='Filter families') + + options, args = parser.parse_args() + + if options.genetree is None: + parser.error("--genetree option must be specified, GeneTree in nhx format") + + # reads single gene tree + genetree = PhyloTree(options.genetree) + + leaves_list = genetree.get_leaf_names() + # Genetree nodes are required to be in gene_species format + leaves_list = [_ for _ in leaves_list if '_' in _] + + species_list = [_.split("_")[1] for _ in leaves_list] + + species_dict = {} + for species in species_list: + count = "one" + if species in species_dict: + count = "many" + species_dict[species] = count + + homologies = { + 'one-to-one': [], + 'one-to-many': [], + 'many-to-one': [], + 'many-to-many': [], + 'paralogs': [] + } + + # stores relevant homology types in dict + for i, leaf1 in enumerate(leaves_list): + for leaf2 in leaves_list[i + 1:]: + id1 = leaf1.split(":")[1] if ":" in leaf1 else leaf1 + id2 = leaf2.split(":")[1] if ":" in leaf2 else leaf2 + species1 = id1.split("_")[1] + species2 = id2.split("_")[1] + if species1 == species2: + homology_type = 'paralogs' + else: + homology_type = species_dict[species1] + "-to-" + species_dict[species2] + homologies[homology_type].append((id1, id2)) + + options.filters = options.filters.split(",") + + if options.out_format == 'tabular': + for homology_type, homologs_list in homologies.items(): + # checks if homology type is in filter + if homology_type in options.filters: + for (gene1, gene2) in homologs_list: + print("%s\t%s\t%s" % (gene1, gene2, homology_type)) + elif options.out_format == 'csv': + print_family = True + for homology_type, homologs_list in homologies.items(): + if homologs_list and homology_type not in options.filters: + print_family = False + break + + # prints family if homology type is not found in filter + if print_family: + print(','.join(leaves_list)) + + +if __name__ == "__main__": + main()