annotate ete_homology_classifier.py @ 6:f1eca1158f21 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 133bb57feca9672734d664e6b34e428488cf2e73
author earlhaminst
date Wed, 10 Oct 2018 05:24:04 -0400
parents 817031b8486d
children ed74587a13c8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
2
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
3 import optparse
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
4
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
5 from ete3 import PhyloTree
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
6
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
7
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
def _gene_id(leaf):
    """Return the gene identifier carried by a leaf name.

    When the name contains ':', the field right after the first ':' is the
    identifier; otherwise the whole name is returned unchanged.
    """
    return leaf.split(":")[1] if ":" in leaf else leaf


def _classify_homologies(gene_ids):
    """Group every unordered pair of genes by homology type.

    ``gene_ids`` are identifiers in gene_species format; the species is the
    second '_'-separated field of each identifier.

    Returns a dict mapping 'one-to-one', 'one-to-many', 'many-to-one',
    'many-to-many' and 'paralogs' to lists of ``(id1, id2)`` tuples, where
    ``id1`` always precedes ``id2`` in ``gene_ids``.
    """
    species_list = [gene_id.split("_")[1] for gene_id in gene_ids]

    # A species is "one" while it has contributed a single gene, and becomes
    # "many" as soon as it is seen a second time.
    species_dict = {}
    for species in species_list:
        species_dict[species] = "many" if species in species_dict else "one"

    homologies = {
        'one-to-one': [],
        'one-to-many': [],
        'many-to-one': [],
        'many-to-many': [],
        'paralogs': []
    }

    for i, (id1, species1) in enumerate(zip(gene_ids, species_list)):
        for id2, species2 in zip(gene_ids[i + 1:], species_list[i + 1:]):
            if species1 == species2:
                # Two genes of the same species are paralogs.
                homology_type = 'paralogs'
            else:
                homology_type = species_dict[species1] + "-to-" + species_dict[species2]
            homologies[homology_type].append((id1, id2))

    return homologies


def main():
    """Classify the homology relationships between the leaves of a gene tree.

    Command-line options:
      --genetree    path of the gene tree file (required); exits through
                    ``parser.error`` when missing.
      --out_format  'tabular' (default) or 'csv'; any other value prints
                    nothing.
      --filters     comma-separated list of homology types.

    Output (on stdout):
      tabular  one "gene1<TAB>gene2<TAB>homology_type" line for every pair
               whose homology type is listed in --filters.
      csv      the comma-joined leaf names, printed only if every homology
               type that has at least one pair is listed in --filters.
    """
    # The parser defines no --speciestree option, so the usage line must not
    # advertise one.
    usage = "usage: %prog --genetree <genetree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--out_format', type='string', default='tabular', help='Choose output format')
    parser.add_option('--filters', default='', help='Filter families')

    options, _ = parser.parse_args()

    if options.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    with open(options.genetree, 'r') as f:
        contents = f.read()

    # Remove empty NHX features that can be produced by TreeBest but break ete3
    contents = contents.replace('[&&NHX]', '')
    # reads single gene tree
    genetree = PhyloTree(contents)

    # Genetree nodes are required to be in gene_species format.  The check is
    # made on the same identifier that the species is later extracted from,
    # so the per-species counts and the pairwise lookup can never disagree.
    leaves_list = [leaf for leaf in genetree.get_leaf_names() if '_' in _gene_id(leaf)]

    # stores relevant homology types in dict
    homologies = _classify_homologies([_gene_id(leaf) for leaf in leaves_list])

    # Keep the parsed filters in a local instead of overwriting the option.
    filters = options.filters.split(",")

    if options.out_format == 'tabular':
        for homology_type, homologs_list in homologies.items():
            # checks if homology type is in filter
            if homology_type in filters:
                for gene1, gene2 in homologs_list:
                    print("%s\t%s\t%s" % (gene1, gene2, homology_type))
    elif options.out_format == 'csv':
        # prints family only if no populated homology type is missing from
        # the filter
        print_family = all(
            not homologs_list or homology_type in filters
            for homology_type, homologs_list in homologies.items()
        )
        if print_family:
            print(','.join(leaves_list))
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
80
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
81
817031b8486d planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff changeset
# Run the classifier only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()