annotate ete_gene_cnv.py @ 9:b29ee6a16524 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
author earlhaminst
date Tue, 20 Oct 2020 15:10:40 +0000
parents 16e925bf567e
children ed74587a13c8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
1 from __future__ import print_function
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
2
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
3 import argparse
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
4 import collections
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
5
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
6 from ete3 import PhyloTree
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
7
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
8
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
def printTSV(myDict, colList=None):
    """ Print a list of dictionaries (myDict) to stdout as tab-separated values.

    The first printed line is the header (the column names); every following
    line is one row of myDict, with its values taken in column order.

    myDict: list of dict-like rows.
    colList: column names in output order. If not specified (or empty), the
        keys of the first row are used, in that row's iteration order; with
        no rows at all, a single empty header line is printed.

    A value which is None is printed as an empty field. A key missing from a
    row is also printed as an empty field, unless the row type supplies its
    own default for missing keys (e.g. collections.Counter, which yields 0).

    Author: Thierry Husson - Use it as you want but don't blame me.
    """
    if not colList:
        colList = list(myDict[0].keys()) if myDict else []

    def cell(row, col):
        # Plain dicts raise KeyError on a missing column; treat that like
        # None instead of aborting the whole table half-way through.
        try:
            value = row[col]
        except KeyError:
            return ''
        return '' if value is None else str(value)

    # Header first, then one line per row; nothing is buffered in memory.
    print(*colList, sep="\t")
    for row in myDict:
        print(*[cell(row, col) for col in colList], sep="\t")
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
24
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
25
16e925bf567e "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff changeset
def _split_leaf_names(leaves):
    """Split each leaf name into a [gene, species] pair; raise if a name is not in gene_species format."""
    leaves_parts = []
    for leaf in leaves:
        parts = leaf.split("_")
        if len(parts) != 2:
            raise Exception("Leaf node '%s' is not in gene_species format" % leaf)
        leaves_parts.append(parts)
    return leaves_parts


def _find_reference_gene(leaves_parts, species_list):
    """Return the gene of the first leaf of the first species in species_list that has a leaf, or None if no listed species has one."""
    present_species = set(species for _, species in leaves_parts)
    for ref_species in species_list:
        if ref_species in present_species:
            break
    else:
        return None

    for gene, species in leaves_parts:
        if species == ref_species:
            return gene
    return None


def main():
    """Print a gene copy number table (TSV) for the gene trees in a file.

    Command-line arguments:
      --genetree      path of a file with one gene tree per line; leaf names
                      must be in gene_species format
      --speciesorder  comma-separated species list; it defines both the
                      order of the species columns and the priority used to
                      pick the reference species of each tree

    For every tree, one row is printed with the number of leaves of each
    listed species, labelled (column 'gene') with the gene of the first leaf
    belonging to the first listed species that occurs in the tree.

    Raises Exception if a leaf name is not in gene_species format, or if
    none of the listed species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [_.strip() for _ in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # The file holds one gene tree per line
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '').strip()

            # Blank lines (e.g. a trailing empty line) are not trees: skip
            # them instead of handing them to the Newick parser
            if not line:
                continue

            genetree = PhyloTree(line)
            leaves_parts = _split_leaf_names(genetree.get_leaf_names())

            # A Counter is used as the row so that species without any leaf
            # in this tree are read back as 0 when the table is printed
            species_counter = collections.Counter(species for _, species in leaves_parts)

            ref_gene = _find_reference_gene(leaves_parts, species_list)
            if ref_gene is None:
                raise Exception("None of the specified species was found in the GeneTree '%s'" % line)
            species_counter['gene'] = ref_gene

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)


# Run the tool only when executed as a script, not when imported
if __name__ == "__main__":
    main()