Mercurial > repos > earlhaminst > ete
annotate ete_homology_classifier.py @ 13:ed74587a13c8 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 23b3c7c09a7d391b576e3b19f8b34dd63d636bdc
author | earlhaminst |
---|---|
date | Thu, 01 Sep 2022 16:11:32 +0000 |
parents | f1eca1158f21 |
children |
rev | line source |
---|---|
5
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
1 from __future__ import print_function |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
2 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
3 import optparse |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
4 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
5 from ete3 import PhyloTree |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
6 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
7 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
def _classify_homologies(leaves_list):
    """Classify every pair of gene-tree leaves by homology type.

    Args:
        leaves_list: leaf names in ``gene_species`` format. A leaf may carry
            a ``prefix:`` segment; when it does, the segment after the first
            ``:`` is used as the gene identifier.

    Returns:
        A dict mapping each homology type ('one-to-one', 'one-to-many',
        'many-to-one', 'many-to-many', 'paralogs') to the list of
        ``(id1, id2)`` tuples of that type, in leaf order.
    """
    # Derive identifiers and species once, from the same string, so that the
    # one/many table below is always keyed consistently with the lookups.
    gene_ids = [leaf.split(":")[1] if ":" in leaf else leaf for leaf in leaves_list]
    species_of = [gene_id.split("_")[-1] for gene_id in gene_ids]

    # "one" when a species contributes a single gene to the tree, else "many"
    species_dict = {}
    for species in species_of:
        species_dict[species] = "many" if species in species_dict else "one"

    homologies = {
        'one-to-one': [],
        'one-to-many': [],
        'many-to-one': [],
        'many-to-many': [],
        'paralogs': [],
    }

    for i, (id1, species1) in enumerate(zip(gene_ids, species_of)):
        for id2, species2 in zip(gene_ids[i + 1:], species_of[i + 1:]):
            if species1 == species2:
                homology_type = 'paralogs'
            else:
                homology_type = species_dict[species1] + "-to-" + species_dict[species2]
            homologies[homology_type].append((id1, id2))

    return homologies


def main():
    """Classify the homology relationships found in a single gene tree.

    Command-line options:
        --genetree    path to the gene tree in NHX format (required)
        --out_format  'tabular' (default) or 'csv'
        --filters     comma-separated list of homology types

    With 'tabular' output, one ``gene1<TAB>gene2<TAB>type`` line is printed
    for every pair whose homology type is listed in --filters. With 'csv'
    output, the comma-joined leaf names of the family are printed only when
    every homology type present in the tree is listed in --filters.

    Exits through ``parser.error`` (status 2) when --genetree is missing.
    """
    # The parser defines no --speciestree option, so the usage must not
    # advertise one.
    usage = "usage: %prog --genetree <genetree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--out_format', type='string', default='tabular', help='Choose output format')
    parser.add_option('--filters', default='', help='Filter families')

    options, _ = parser.parse_args()

    if options.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    with open(options.genetree, 'r') as f:
        contents = f.read()

    # Remove empty NHX features that can be produced by TreeBest but break ete3
    contents = contents.replace('[&&NHX]', '')
    # reads single gene tree
    genetree = PhyloTree(contents)

    # Genetree nodes are required to be in gene_species format
    leaves_list = [leaf for leaf in genetree.get_leaf_names() if '_' in leaf]

    homologies = _classify_homologies(leaves_list)

    # Tolerate whitespace around the commas; keep the parsed options untouched.
    filters = set(name.strip() for name in options.filters.split(","))

    if options.out_format == 'tabular':
        for homology_type, homologs_list in homologies.items():
            # checks if homology type is in filter
            if homology_type in filters:
                for (gene1, gene2) in homologs_list:
                    print("%s\t%s\t%s" % (gene1, gene2, homology_type))
    elif options.out_format == 'csv':
        # prints family only if no homology type present in the tree falls
        # outside the filter
        print_family = all(
            homology_type in filters
            for homology_type, homologs_list in homologies.items()
            if homologs_list
        )
        if print_family:
            print(','.join(leaves_list))
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
80 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
81 |
817031b8486d
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit b97aee603b9acf29981719160e963a1efe2946d0
earlhaminst
parents:
diff
changeset
|
# Run the classifier only when executed as a script, not when imported.
if __name__ == "__main__":
    main()