comparison ete_gene_cnv.py @ 8:16e925bf567e draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
author earlhaminst
date Thu, 31 Oct 2019 07:48:59 -0400
parents
children ed74587a13c8
comparison
equal deleted inserted replaced
7:6a5282f71f82 8:16e925bf567e
1 from __future__ import print_function
2
3 import argparse
4 import collections
5
6 from ete3 import PhyloTree
7
8
9 def printTSV(myDict, colList=None):
10 """ Pretty print a list of dictionaries (myDict) as a dynamically sized table.
11 If column names (colList) aren't specified, they will show in random order.
12 Author: Thierry Husson - Use it as you want but don't blame me.
13 """
14 if not colList:
15 colList = list(myDict[0].keys() if myDict else [])
16
17 myList = [colList]
18
19 for item in myDict:
20 myList.append([str(item[col] if item[col] is not None else '') for col in colList])
21
22 for item in myList:
23 print(*item, sep="\t")
24
25
26 def main():
27 parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
28 parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
29 parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
30 args = parser.parse_args()
31
32 species_list = args.speciesorder.split(",")
33 species_list = [_.strip() for _ in species_list]
34 table = []
35
36 with open(args.genetree, "r") as f:
37 # reads multiple gene tree line by line gene tree
38 for line in f:
39 # Remove empty NHX features that can be produced by TreeBest but break ete3
40 line = line.replace('[&&NHX]', '')
41
42 # reads single gene tree
43 genetree = PhyloTree(line)
44 leaves = genetree.get_leaf_names()
45
46 leaves_parts = [_.split("_") for _ in leaves]
47 for i, leaf_parts in enumerate(leaves_parts):
48 if len(leaf_parts) != 2:
49 raise Exception("Leaf node '%s' is not in gene_species format" % leaves[i])
50
51 leaves_species = [_[1] for _ in leaves_parts]
52 species_counter = collections.Counter(leaves_species)
53
54 # Assign to ref_species the first element of species_list which
55 # appears in a leaf node
56 for ref_species in species_list:
57 if ref_species in species_counter:
58 break
59 else:
60 raise Exception("None of the specified species was found in the GeneTree '%s'" % line)
61
62 # Find the gene of the (first) leaf node for the ref_species
63 for leaf_parts in leaves_parts:
64 if leaf_parts[1] == ref_species:
65 species_counter['gene'] = leaf_parts[0]
66 break
67
68 table.append(species_counter)
69
70 colList = ["gene"] + species_list
71 printTSV(table, colList)
72
73
74 if __name__ == "__main__":
75 main()