Mercurial > repos > earlhaminst > ete
diff ete_genetree_splitter.py @ 12:dc32007a6b36 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author | earlhaminst |
---|---|
date | Tue, 07 Jun 2022 08:58:05 +0000 |
parents | b29ee6a16524 |
children |
line wrap: on
line diff
--- a/ete_genetree_splitter.py Thu Mar 10 14:01:44 2022 +0000 +++ b/ete_genetree_splitter.py Tue Jun 07 08:58:05 2022 +0000 @@ -1,6 +1,8 @@ from __future__ import print_function import optparse +import os +import sys from ete3 import PhyloTree @@ -10,16 +12,26 @@ parser = optparse.OptionParser(usage=usage) parser.add_option('--genetree', help='GeneTree in nhx format') parser.add_option('--speciestree', help='Species Tree in nhx format') + parser.add_option('--ingroup', help='Species Tree in nhx format') + parser.add_option('--outgroup', help='Species Tree in nhx format') parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)') parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene') parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose') - parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') + parser.add_option('--split', type='choice', choices=['dups', 'treeko', 'species'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)') + parser.add_option('-d', '--dir', type='string', default="", help="Absolute or relative path to output directory. If directory does not exist it will be created") + options, args = parser.parse_args() + if options.dir and not os.path.exists(options.dir): + os.makedirs(options.dir) + if options.genetree is None: parser.error("--genetree option must be specified, GeneTree in nhx format") + if os.stat(options.genetree).st_size == 0: + sys.exit() + with open(options.genetree, 'r') as f: contents = f.read() @@ -35,7 +47,6 @@ # reconcile species tree with gene tree to help find out gene gain/lose if options.gainlose: - if options.speciestree is None: parser.error("--speciestree option must be specified, species tree in nhx format") @@ -50,24 +61,86 @@ if options.split == "dups": # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes. - for cluster_id, node in enumerate(genetree.split_by_dups(), 1): - outfile = str(cluster_id) + '_genetree.nhx' + for cluster_id, node in enumerate(genetree.split_by_dups(), start=1): + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) with open(outfile, 'w') as f: f.write(node.write(format=options.output_format)) elif options.split == "treeko": # splits tree using the TreeKO algorithm. ntrees, ndups, sptrees = genetree.get_speciation_trees() - cluster_id = 0 - for spt in sptrees: - cluster_id = cluster_id + 1 - outfile = str(cluster_id) + '_genetree.nhx' + for cluster_id, spt in enumerate(sptrees, start=1): + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) with open(outfile, 'w') as f: f.write(spt.write(format=options.output_format)) + elif options.split == "species": + ingroup = options.ingroup.split(",") + outgroup = options.outgroup.split(",") + cluster_id = 0 + + def split_tree_by_species(tree, ingroup, outgroup): + nonlocal cluster_id + + if len(outgroup) > 0: + outgroup_bool = check_outgroup(tree, outgroup) + else: + outgroup_bool = True + + if outgroup_bool and check_ingroup(tree, ingroup): + child1, child2 = tree.children + split_tree_by_species(child1, ingroup, outgroup) + split_tree_by_species(child2, ingroup, outgroup) + else: + cluster_id += 1 + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) + with open(outfile, 'w') as f: + f.write(tree.write(format=options.output_format)) + + split_tree_by_species(genetree, ingroup, outgroup) + + +def check_outgroup(tree, outgroup): + species = get_species(tree) + + count = 0 + + for out in outgroup: + if species.count(out) > 1: + count = count + 1 + + return count >= len(outgroup) / 2 + + +def check_ingroup(tree, ingroup): + species = get_species(tree) + + count = 0 + + for ing in ingroup: + if species.count(ing) > 1: + count = count + 1 + + return count > 0 and len(ingroup) / count >= 0.8 def parse_sp_name(node_name): - return node_name.split("_")[1] + return node_name.split("_")[-1] + + +def get_species(node): + leaves_list = node.get_leaf_names() + # Genetree nodes are required to be in gene_species format + leaves_list = [_ for _ in leaves_list if '_' in _] + + species_list = [_.split("_")[-1] for _ in leaves_list] + + return species_list if __name__ == "__main__":