Mercurial > repos > earlhaminst > ete
annotate ete_gene_cnv.py @ 9:b29ee6a16524 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit 17c65045b726d0695814bfe761e534f6521786f1"
author | earlhaminst |
---|---|
date | Tue, 20 Oct 2020 15:10:40 +0000 |
parents | 16e925bf567e |
children | ed74587a13c8 |
rev | line source |
---|---|
8
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
1 from __future__ import print_function |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
2 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
3 import argparse |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
4 import collections |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
5 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
6 from ete3 import PhyloTree |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
7 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
8 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
def printTSV(myDict, colList=None):
    """Print a list of dict-like rows (myDict) to stdout as tab-separated values.

    The header line (colList) is printed first, followed by one line per row.
    If colList is not given (or is empty), the keys of the first row are used,
    in that row's iteration order; with no rows either, a single empty header
    line is printed.

    A value of None is printed as an empty field. A column that a row does not
    have is also printed as an empty field instead of raising KeyError; rows
    that provide their own default for missing keys (e.g. collections.Counter,
    which yields 0) keep that default.

    Author: Thierry Husson - Use it as you want but don't blame me.
    """
    if not colList:
        colList = list(myDict[0].keys() if myDict else [])

    def cell(row, col):
        # Use item access rather than .get() so that mappings defining
        # __missing__ (Counter, defaultdict) still return their own default.
        try:
            value = row[col]
        except KeyError:
            value = None
        return '' if value is None else str(value)

    print(*colList, sep="\t")
    for row in myDict:
        print(*[cell(row, col) for col in colList], sep="\t")
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
24 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
25 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
def _count_species(leaves, species_list, newick):
    """Build one output row from the leaf names of a single gene tree.

    leaves is the list of leaf names, each expected in gene_species format
    (exactly one underscore). species_list is the ordered list of species of
    interest. newick is the tree text, used only in error messages.

    Returns a collections.Counter mapping each species found in the leaves to
    its number of leaves, plus a 'gene' entry holding the gene of the first
    leaf belonging to the reference species (the first entry of species_list
    that occurs in the tree).

    Raises ValueError if a leaf name is not in gene_species format or if none
    of the species in species_list occurs in the tree.
    """
    leaves_parts = [leaf.split("_") for leaf in leaves]
    for leaf, parts in zip(leaves, leaves_parts):
        if len(parts) != 2:
            raise ValueError("Leaf node '%s' is not in gene_species format" % leaf)

    species_counter = collections.Counter(parts[1] for parts in leaves_parts)

    # The reference species is the first element of species_list which
    # appears in a leaf node.
    ref_species = next((s for s in species_list if s in species_counter), None)
    if ref_species is None:
        raise ValueError("None of the specified species was found in the GeneTree '%s'" % newick)

    # Gene of the (first) leaf node for the reference species; a match is
    # guaranteed because ref_species was taken from the counted leaves.
    species_counter['gene'] = next(
        gene for gene, species in leaves_parts if species == ref_species
    )
    return species_counter


def main(argv=None):
    """Print a gene copy number table for the gene trees in a file.

    Command-line arguments (taken from argv, or from sys.argv when argv is
    None):
      --genetree      path to a file with one gene tree per line
      --speciesorder  comma-separated species list; sets the column order

    One TSV row is printed per gene tree: a 'gene' column followed by the
    number of leaves found for each listed species. Blank lines in the input
    file are skipped.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
    args = parser.parse_args(argv)

    species_list = [species.strip() for species in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # The file holds one gene tree per line
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            newick = line.replace('[&&NHX]', '').strip()
            if not newick:
                # A blank line (e.g. at the end of the file) is not a tree
                continue

            genetree = PhyloTree(newick)
            table.append(_count_species(genetree.get_leaf_names(), species_list, newick))

    colList = ["gene"] + species_list
    printTSV(table, colList)
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
72 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
73 |
16e925bf567e
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
earlhaminst
parents:
diff
changeset
|
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()