comparison ete_genetree_splitter.py @ 12:dc32007a6b36 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author earlhaminst
date Tue, 07 Jun 2022 08:58:05 +0000
parents b29ee6a16524
children
comparison
equal deleted inserted replaced
11:2db72467da51 12:dc32007a6b36
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import optparse 3 import optparse
4 import os
5 import sys
4 6
5 from ete3 import PhyloTree 7 from ete3 import PhyloTree
6 8
7 9
8 def main(): 10 def main():
9 usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]" 11 usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
10 parser = optparse.OptionParser(usage=usage) 12 parser = optparse.OptionParser(usage=usage)
11 parser.add_option('--genetree', help='GeneTree in nhx format') 13 parser.add_option('--genetree', help='GeneTree in nhx format')
12 parser.add_option('--speciestree', help='Species Tree in nhx format') 14 parser.add_option('--speciestree', help='Species Tree in nhx format')
15 parser.add_option('--ingroup', help='Species Tree in nhx format')
16 parser.add_option('--outgroup', help='Species Tree in nhx format')
13 parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)') 17 parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)')
14 parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene') 18 parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene')
15 parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose') 19 parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose')
16 parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') 20 parser.add_option('--split', type='choice', choices=['dups', 'treeko', 'species'], dest="split", default='dups', help='Choose GeneTree splitting algorithms')
17 parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)') 21 parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)')
22 parser.add_option('-d', '--dir', type='string', default="", help="Absolute or relative path to output directory. If directory does not exist it will be created")
23
18 options, args = parser.parse_args() 24 options, args = parser.parse_args()
25
26 if options.dir and not os.path.exists(options.dir):
27 os.makedirs(options.dir)
19 28
20 if options.genetree is None: 29 if options.genetree is None:
21 parser.error("--genetree option must be specified, GeneTree in nhx format") 30 parser.error("--genetree option must be specified, GeneTree in nhx format")
31
32 if os.stat(options.genetree).st_size == 0:
33 sys.exit()
22 34
23 with open(options.genetree, 'r') as f: 35 with open(options.genetree, 'r') as f:
24 contents = f.read() 36 contents = f.read()
25 37
26 # Remove empty NHX features that can be produced by TreeBest but break ete3 38 # Remove empty NHX features that can be produced by TreeBest but break ete3
33 if options.gene_node == 0: 45 if options.gene_node == 0:
34 genetree.set_species_naming_function(parse_sp_name) 46 genetree.set_species_naming_function(parse_sp_name)
35 47
36 # reconcile species tree with gene tree to help find out gene gain/lose 48 # reconcile species tree with gene tree to help find out gene gain/lose
37 if options.gainlose: 49 if options.gainlose:
38
39 if options.speciestree is None: 50 if options.speciestree is None:
40 parser.error("--speciestree option must be specified, species tree in nhx format") 51 parser.error("--speciestree option must be specified, species tree in nhx format")
41 52
42 # reads species tree 53 # reads species tree
43 speciestree = PhyloTree(options.speciestree, format=options.species_format) 54 speciestree = PhyloTree(options.speciestree, format=options.species_format)
48 59
49 genetree, events = genetree.reconcile(speciestree) 60 genetree, events = genetree.reconcile(speciestree)
50 61
51 if options.split == "dups": 62 if options.split == "dups":
52 # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes. 63 # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes.
53 for cluster_id, node in enumerate(genetree.split_by_dups(), 1): 64 for cluster_id, node in enumerate(genetree.split_by_dups(), start=1):
54 outfile = str(cluster_id) + '_genetree.nhx' 65 outfile = '{}_genetree.nhx'.format(cluster_id)
66 if options.dir:
67 outfile = os.path.join(options.dir, outfile)
55 with open(outfile, 'w') as f: 68 with open(outfile, 'w') as f:
56 f.write(node.write(format=options.output_format)) 69 f.write(node.write(format=options.output_format))
57 elif options.split == "treeko": 70 elif options.split == "treeko":
58 # splits tree using the TreeKO algorithm. 71 # splits tree using the TreeKO algorithm.
59 ntrees, ndups, sptrees = genetree.get_speciation_trees() 72 ntrees, ndups, sptrees = genetree.get_speciation_trees()
60 73
61 cluster_id = 0 74 for cluster_id, spt in enumerate(sptrees, start=1):
62 for spt in sptrees: 75 outfile = '{}_genetree.nhx'.format(cluster_id)
63 cluster_id = cluster_id + 1 76 if options.dir:
64 outfile = str(cluster_id) + '_genetree.nhx' 77 outfile = os.path.join(options.dir, outfile)
65 with open(outfile, 'w') as f: 78 with open(outfile, 'w') as f:
66 f.write(spt.write(format=options.output_format)) 79 f.write(spt.write(format=options.output_format))
80 elif options.split == "species":
81 ingroup = options.ingroup.split(",")
82 outgroup = options.outgroup.split(",")
83 cluster_id = 0
84
85 def split_tree_by_species(tree, ingroup, outgroup):
86 nonlocal cluster_id
87
88 if len(outgroup) > 0:
89 outgroup_bool = check_outgroup(tree, outgroup)
90 else:
91 outgroup_bool = True
92
93 if outgroup_bool and check_ingroup(tree, ingroup):
94 child1, child2 = tree.children
95 split_tree_by_species(child1, ingroup, outgroup)
96 split_tree_by_species(child2, ingroup, outgroup)
97 else:
98 cluster_id += 1
99 outfile = '{}_genetree.nhx'.format(cluster_id)
100 if options.dir:
101 outfile = os.path.join(options.dir, outfile)
102 with open(outfile, 'w') as f:
103 f.write(tree.write(format=options.output_format))
104
105 split_tree_by_species(genetree, ingroup, outgroup)
106
107
108 def check_outgroup(tree, outgroup):
109 species = get_species(tree)
110
111 count = 0
112
113 for out in outgroup:
114 if species.count(out) > 1:
115 count = count + 1
116
117 return count >= len(outgroup) / 2
118
119
120 def check_ingroup(tree, ingroup):
121 species = get_species(tree)
122
123 count = 0
124
125 for ing in ingroup:
126 if species.count(ing) > 1:
127 count = count + 1
128
129 return count > 0 and len(ingroup) / count >= 0.8
67 130
68 131
69 def parse_sp_name(node_name): 132 def parse_sp_name(node_name):
70 return node_name.split("_")[1] 133 return node_name.split("_")[-1]
134
135
136 def get_species(node):
137 leaves_list = node.get_leaf_names()
138 # Genetree nodes are required to be in gene_species format
139 leaves_list = [_ for _ in leaves_list if '_' in _]
140
141 species_list = [_.split("_")[-1] for _ in leaves_list]
142
143 return species_list
71 144
72 145
73 if __name__ == "__main__": 146 if __name__ == "__main__":
74 main() 147 main()