Mercurial > repos > earlhaminst > ete
changeset 12:dc32007a6b36 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author | earlhaminst |
---|---|
date | Tue, 07 Jun 2022 08:58:05 +0000 |
parents | 2db72467da51 |
children | ed74587a13c8 |
files | ete_genetree_splitter.py ete_genetree_splitter.xml test-data/41_genetree.nhx test-data/42_genetree.nhx test-data/43_genetree.nhx test-data/44_genetree.nhx |
diffstat | 6 files changed, 122 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/ete_genetree_splitter.py Thu Mar 10 14:01:44 2022 +0000 +++ b/ete_genetree_splitter.py Tue Jun 07 08:58:05 2022 +0000 @@ -1,6 +1,8 @@ from __future__ import print_function import optparse +import os +import sys from ete3 import PhyloTree @@ -10,16 +12,26 @@ parser = optparse.OptionParser(usage=usage) parser.add_option('--genetree', help='GeneTree in nhx format') parser.add_option('--speciestree', help='Species Tree in nhx format') + parser.add_option('--ingroup', help='Species Tree in nhx format') + parser.add_option('--outgroup', help='Species Tree in nhx format') parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)') parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene') parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose') - parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') + parser.add_option('--split', type='choice', choices=['dups', 'treeko', 'species'], dest="split", default='dups', help='Choose GeneTree splitting algorithms') parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)') + parser.add_option('-d', '--dir', type='string', default="", help="Absolute or relative path to output directory. If directory does not exist it will be created") + options, args = parser.parse_args() + if options.dir and not os.path.exists(options.dir): + os.makedirs(options.dir) + if options.genetree is None: parser.error("--genetree option must be specified, GeneTree in nhx format") + if os.stat(options.genetree).st_size == 0: + sys.exit() + with open(options.genetree, 'r') as f: contents = f.read() @@ -35,7 +47,6 @@ # reconcile species tree with gene tree to help find out gene gain/lose if options.gainlose: - if options.speciestree is None: parser.error("--speciestree option must be specified, species tree in nhx format") @@ -50,24 +61,86 @@ if options.split == "dups": # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes. - for cluster_id, node in enumerate(genetree.split_by_dups(), 1): - outfile = str(cluster_id) + '_genetree.nhx' + for cluster_id, node in enumerate(genetree.split_by_dups(), start=1): + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) with open(outfile, 'w') as f: f.write(node.write(format=options.output_format)) elif options.split == "treeko": # splits tree using the TreeKO algorithm. ntrees, ndups, sptrees = genetree.get_speciation_trees() - cluster_id = 0 - for spt in sptrees: - cluster_id = cluster_id + 1 - outfile = str(cluster_id) + '_genetree.nhx' + for cluster_id, spt in enumerate(sptrees, start=1): + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) with open(outfile, 'w') as f: f.write(spt.write(format=options.output_format)) + elif options.split == "species": + ingroup = options.ingroup.split(",") + outgroup = options.outgroup.split(",") + cluster_id = 0 + + def split_tree_by_species(tree, ingroup, outgroup): + nonlocal cluster_id + + if len(outgroup) > 0: + outgroup_bool = check_outgroup(tree, outgroup) + else: + outgroup_bool = True + + if outgroup_bool and check_ingroup(tree, ingroup): + child1, child2 = tree.children + split_tree_by_species(child1, ingroup, outgroup) + split_tree_by_species(child2, ingroup, outgroup) + else: + cluster_id += 1 + outfile = '{}_genetree.nhx'.format(cluster_id) + if options.dir: + outfile = os.path.join(options.dir, outfile) + with open(outfile, 'w') as f: + f.write(tree.write(format=options.output_format)) + + split_tree_by_species(genetree, ingroup, outgroup) + + +def check_outgroup(tree, outgroup): + species = get_species(tree) + + count = 0 + + for out in outgroup: + if species.count(out) > 1: + count = count + 1 + + return count >= len(outgroup) / 2 + + +def check_ingroup(tree, ingroup): + species = get_species(tree) + + count = 0 + + for ing in ingroup: + if species.count(ing) > 1: + count = count + 1 + + return count > 0 and len(ingroup) / count >= 0.8 def parse_sp_name(node_name): - return node_name.split("_")[1] + return node_name.split("_")[-1] + + +def get_species(node): + leaves_list = node.get_leaf_names() + # Genetree nodes are required to be in gene_species format + leaves_list = [_ for _ in leaves_list if '_' in _] + + species_list = [_.split("_")[-1] for _ in leaves_list] + + return species_list if __name__ == "__main__":
--- a/ete_genetree_splitter.xml Thu Mar 10 14:01:44 2022 +0000 +++ b/ete_genetree_splitter.xml Tue Jun 07 08:58:05 2022 +0000 @@ -1,4 +1,4 @@ -<tool id="ete_genetree_splitter" name="ETE GeneTree splitter" version="@VERSION@"> +<tool id="ete_genetree_splitter" name="ETE GeneTree splitter" version="@VERSION@+galaxy1"> <description>from a genetree using the ETE Toolkit</description> <macros> <import>ete_macros.xml</import> @@ -12,24 +12,37 @@ python '$__tool_directory__/ete_genetree_splitter.py' --genetree '$genetreeFile' --gene_node $gene_node ---split $splitter +--split $splitting_conditional.splitter #if $gainlose_conditional.gainlose == "True" --speciestree '$gainlose_conditional.speciesFile' --species_format $gainlose_conditional.species_format --gainlose #end if +#if $splitting_conditional.splitter == "species" + --ingroup '$splitting_conditional.ingroup' + --outgroup '$splitting_conditional.outgroup' +#end if --output_format $output_format ]]></command> <inputs> <param name="genetreeFile" type="data" format="nhx" label="GeneTree file" help="GeneTree in nhx format" /> - <param name="gene_node" type="select" label="Select Gene node format" help="Select Gene node format from one of the option"> + <param name="gene_node" type="select" label="Select Gene node format" help="Select Gene node format from one of the option"> <option value="0" selected="true">gene_species</option> <option value="1">species_gene</option> </param> - <param name="splitter" type="select" label="GeneTree splitting algorithm"> - <option value="dups">Split by Duplication</option> - <option value="treeko">Split using TreeKO algorithm</option> - </param> + <conditional name="splitting_conditional"> + <param name="splitter" type="select" label="GeneTree splitting algorithm"> + <option value="dups">Split by Duplication</option> + <option value="treeko">Split using TreeKO algorithm</option> + <option value="species">Split using In-group and Out-group species</option> + </param> + <when value="dups" /> + <when value="treeko" /> + <when value="species"> + <param name="ingroup" type="text" label="In-group species list" help="Comma-separated species list without whitespaces or special characters" /> + <param name="outgroup" type="text" label="Out-group species list" help="Comma-separated species list without whitespaces or special characters" /> + </when> + </conditional> <conditional name="gainlose_conditional"> <param name="gainlose" type="select" label="Find out gene gain/lose"> <option value="True">Yes</option> @@ -107,13 +120,28 @@ <param name="gene_node" value="0"/> <param name="splitter" value="treeko"/> <param name="output_format" value="9" /> - <output_collection name="genetrees_lists" type="list" count="5"> + <output_collection name="genetrees_lists" type="list" count="4"> <element name="1" file="31_genetree.nhx" ftype="nhx" /> <element name="2" file="32_genetree.nhx" ftype="nhx" /> <element name="3" file="33_genetree.nhx" ftype="nhx" /> <element name="4" file="34_genetree.nhx" ftype="nhx" /> </output_collection> </test> + + <test> + <param name="genetreeFile" ftype="nhx" value="genetree.nhx" /> + <param name="gene_node" value="0"/> + <param name="splitter" value="species"/> + <param name="ingroup" value="pantroglodytes,homosapiens,rattusnorvegicus,musmusculus,canisfamiliaris" /> + <param name="outgroup" value="susscrofa" /> + <param name="output_format" value="9" /> + <output_collection name="genetrees_lists" type="list" count="4"> + <element name="1" file="41_genetree.nhx" ftype="nhx" /> + <element name="2" file="42_genetree.nhx" ftype="nhx" /> + <element name="3" file="43_genetree.nhx" ftype="nhx" /> + <element name="4" file="44_genetree.nhx" ftype="nhx" /> + </output_collection> + </test> </tests> <help><![CDATA[ Split GeneTrees from single GeneTree by duplication event using the `ETE Toolkit`_.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/41_genetree.nhx Tue Jun 07 08:58:05 2022 +0000 @@ -0,0 +1,1 @@ +(((insr_rattusnorvegicus,insr_musmusculus),(insr_homosapiens,insr_pantroglodytes)),insr_susscrofa); \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/42_genetree.nhx Tue Jun 07 08:58:05 2022 +0000 @@ -0,0 +1,1 @@ +((maob_rattusnorvegicus,maob_musmusculus),((maob_homosapiens,maob_pantroglodytes),(maob_susscrofa,maob_canisfamiliaris))); \ No newline at end of file