Mercurial > repos > earlhaminst > ete
comparison ete_genetree_splitter.py @ 12:dc32007a6b36 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author | earlhaminst |
---|---|
date | Tue, 07 Jun 2022 08:58:05 +0000 |
parents | b29ee6a16524 |
children |
comparison
equal
deleted
inserted
replaced
11:2db72467da51 | 12:dc32007a6b36 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import optparse | 3 import optparse |
4 import os | |
5 import sys | |
4 | 6 |
5 from ete3 import PhyloTree | 7 from ete3 import PhyloTree |
6 | 8 |
7 | 9 |
8 def main(): | 10 def main(): |
9 usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]" | 11 usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]" |
10 parser = optparse.OptionParser(usage=usage) | 12 parser = optparse.OptionParser(usage=usage) |
11 parser.add_option('--genetree', help='GeneTree in nhx format') | 13 parser.add_option('--genetree', help='GeneTree in nhx format') |
12 parser.add_option('--speciestree', help='Species Tree in nhx format') | 14 parser.add_option('--speciestree', help='Species Tree in nhx format') |
15 parser.add_option('--ingroup', help='Species Tree in nhx format') | |
16 parser.add_option('--outgroup', help='Species Tree in nhx format') | |
13 parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)') | 17 parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)') |
14 parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene') | 18 parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene') |
15 parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose') | 19 parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose') |
16 parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') | 20 parser.add_option('--split', type='choice', choices=['dups', 'treeko', 'species'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') |
17 parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)') | 21 parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)') |
22 parser.add_option('-d', '--dir', type='string', default="", help="Absolute or relative path to output directory. If directory does not exist it will be created") | |
23 | |
18 options, args = parser.parse_args() | 24 options, args = parser.parse_args() |
25 | |
26 if options.dir and not os.path.exists(options.dir): | |
27 os.makedirs(options.dir) | |
19 | 28 |
20 if options.genetree is None: | 29 if options.genetree is None: |
21 parser.error("--genetree option must be specified, GeneTree in nhx format") | 30 parser.error("--genetree option must be specified, GeneTree in nhx format") |
31 | |
32 if os.stat(options.genetree).st_size == 0: | |
33 sys.exit() | |
22 | 34 |
23 with open(options.genetree, 'r') as f: | 35 with open(options.genetree, 'r') as f: |
24 contents = f.read() | 36 contents = f.read() |
25 | 37 |
26 # Remove empty NHX features that can be produced by TreeBest but break ete3 | 38 # Remove empty NHX features that can be produced by TreeBest but break ete3 |
33 if options.gene_node == 0: | 45 if options.gene_node == 0: |
34 genetree.set_species_naming_function(parse_sp_name) | 46 genetree.set_species_naming_function(parse_sp_name) |
35 | 47 |
36 # reconcile species tree with gene tree to help find out gene gain/lose | 48 # reconcile species tree with gene tree to help find out gene gain/lose |
37 if options.gainlose: | 49 if options.gainlose: |
38 | |
39 if options.speciestree is None: | 50 if options.speciestree is None: |
40 parser.error("--speciestree option must be specified, species tree in nhx format") | 51 parser.error("--speciestree option must be specified, species tree in nhx format") |
41 | 52 |
42 # reads species tree | 53 # reads species tree |
43 speciestree = PhyloTree(options.speciestree, format=options.species_format) | 54 speciestree = PhyloTree(options.speciestree, format=options.species_format) |
48 | 59 |
49 genetree, events = genetree.reconcile(speciestree) | 60 genetree, events = genetree.reconcile(speciestree) |
50 | 61 |
51 if options.split == "dups": | 62 if options.split == "dups": |
52 # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes. | 63 # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes. |
53 for cluster_id, node in enumerate(genetree.split_by_dups(), 1): | 64 for cluster_id, node in enumerate(genetree.split_by_dups(), start=1): |
54 outfile = str(cluster_id) + '_genetree.nhx' | 65 outfile = '{}_genetree.nhx'.format(cluster_id) |
66 if options.dir: | |
67 outfile = os.path.join(options.dir, outfile) | |
55 with open(outfile, 'w') as f: | 68 with open(outfile, 'w') as f: |
56 f.write(node.write(format=options.output_format)) | 69 f.write(node.write(format=options.output_format)) |
57 elif options.split == "treeko": | 70 elif options.split == "treeko": |
58 # splits tree using the TreeKO algorithm. | 71 # splits tree using the TreeKO algorithm. |
59 ntrees, ndups, sptrees = genetree.get_speciation_trees() | 72 ntrees, ndups, sptrees = genetree.get_speciation_trees() |
60 | 73 |
61 cluster_id = 0 | 74 for cluster_id, spt in enumerate(sptrees, start=1): |
62 for spt in sptrees: | 75 outfile = '{}_genetree.nhx'.format(cluster_id) |
63 cluster_id = cluster_id + 1 | 76 if options.dir: |
64 outfile = str(cluster_id) + '_genetree.nhx' | 77 outfile = os.path.join(options.dir, outfile) |
65 with open(outfile, 'w') as f: | 78 with open(outfile, 'w') as f: |
66 f.write(spt.write(format=options.output_format)) | 79 f.write(spt.write(format=options.output_format)) |
80 elif options.split == "species": | |
81 ingroup = options.ingroup.split(",") | |
82 outgroup = options.outgroup.split(",") | |
83 cluster_id = 0 | |
84 | |
85 def split_tree_by_species(tree, ingroup, outgroup): | |
86 nonlocal cluster_id | |
87 | |
88 if len(outgroup) > 0: | |
89 outgroup_bool = check_outgroup(tree, outgroup) | |
90 else: | |
91 outgroup_bool = True | |
92 | |
93 if outgroup_bool and check_ingroup(tree, ingroup): | |
94 child1, child2 = tree.children | |
95 split_tree_by_species(child1, ingroup, outgroup) | |
96 split_tree_by_species(child2, ingroup, outgroup) | |
97 else: | |
98 cluster_id += 1 | |
99 outfile = '{}_genetree.nhx'.format(cluster_id) | |
100 if options.dir: | |
101 outfile = os.path.join(options.dir, outfile) | |
102 with open(outfile, 'w') as f: | |
103 f.write(tree.write(format=options.output_format)) | |
104 | |
105 split_tree_by_species(genetree, ingroup, outgroup) | |
106 | |
107 | |
108 def check_outgroup(tree, outgroup): | |
109 species = get_species(tree) | |
110 | |
111 count = 0 | |
112 | |
113 for out in outgroup: | |
114 if species.count(out) > 1: | |
115 count = count + 1 | |
116 | |
117 return count >= len(outgroup) / 2 | |
118 | |
119 | |
120 def check_ingroup(tree, ingroup): | |
121 species = get_species(tree) | |
122 | |
123 count = 0 | |
124 | |
125 for ing in ingroup: | |
126 if species.count(ing) > 1: | |
127 count = count + 1 | |
128 | |
129 return count > 0 and len(ingroup) / count >= 0.8 | |
67 | 130 |
68 | 131 |
69 def parse_sp_name(node_name): | 132 def parse_sp_name(node_name): |
70 return node_name.split("_")[1] | 133 return node_name.split("_")[-1] |
134 | |
135 | |
136 def get_species(node): | |
137 leaves_list = node.get_leaf_names() | |
138 # Genetree nodes are required to be in gene_species format | |
139 leaves_list = [_ for _ in leaves_list if '_' in _] | |
140 | |
141 species_list = [_.split("_")[-1] for _ in leaves_list] | |
142 | |
143 return species_list | |
71 | 144 |
72 | 145 |
73 if __name__ == "__main__": | 146 if __name__ == "__main__": |
74 main() | 147 main() |