comparison ete_homology_classifier.py @ 5:817031b8486d draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
author earlhaminst
date Thu, 10 May 2018 06:15:17 -0400
parents
children f1eca1158f21
comparison
equal deleted inserted replaced
4:87b6de3ef63e 5:817031b8486d
1 from __future__ import print_function
2
3 import optparse
4
5 from ete3 import PhyloTree
6
7
8 def main():
9 usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
10 parser = optparse.OptionParser(usage=usage)
11 parser.add_option('--genetree', help='GeneTree in nhx format')
12 parser.add_option('--out_format', type='string', default='tabular', help='Choose output format')
13 parser.add_option('--filters', default='', help='Filter families')
14
15 options, args = parser.parse_args()
16
17 if options.genetree is None:
18 parser.error("--genetree option must be specified, GeneTree in nhx format")
19
20 # reads single gene tree
21 genetree = PhyloTree(options.genetree)
22
23 leaves_list = genetree.get_leaf_names()
24 # Genetree nodes are required to be in gene_species format
25 leaves_list = [_ for _ in leaves_list if '_' in _]
26
27 species_list = [_.split("_")[1] for _ in leaves_list]
28
29 species_dict = {}
30 for species in species_list:
31 count = "one"
32 if species in species_dict:
33 count = "many"
34 species_dict[species] = count
35
36 homologies = {
37 'one-to-one': [],
38 'one-to-many': [],
39 'many-to-one': [],
40 'many-to-many': [],
41 'paralogs': []
42 }
43
44 # stores relevant homology types in dict
45 for i, leaf1 in enumerate(leaves_list):
46 for leaf2 in leaves_list[i + 1:]:
47 id1 = leaf1.split(":")[1] if ":" in leaf1 else leaf1
48 id2 = leaf2.split(":")[1] if ":" in leaf2 else leaf2
49 species1 = id1.split("_")[1]
50 species2 = id2.split("_")[1]
51 if species1 == species2:
52 homology_type = 'paralogs'
53 else:
54 homology_type = species_dict[species1] + "-to-" + species_dict[species2]
55 homologies[homology_type].append((id1, id2))
56
57 options.filters = options.filters.split(",")
58
59 if options.out_format == 'tabular':
60 for homology_type, homologs_list in homologies.items():
61 # checks if homology type is in filter
62 if homology_type in options.filters:
63 for (gene1, gene2) in homologs_list:
64 print("%s\t%s\t%s" % (gene1, gene2, homology_type))
65 elif options.out_format == 'csv':
66 print_family = True
67 for homology_type, homologs_list in homologies.items():
68 if homologs_list and homology_type not in options.filters:
69 print_family = False
70 break
71
72 # prints family if homology type is not found in filter
73 if print_family:
74 print(','.join(leaves_list))
75
76
77 if __name__ == "__main__":
78 main()