changeset 12:dc32007a6b36 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit c568584f1eaa1366603b89db7e52994812f5d387
author earlhaminst
date Tue, 07 Jun 2022 08:58:05 +0000
parents 2db72467da51
children ed74587a13c8
files ete_genetree_splitter.py ete_genetree_splitter.xml test-data/41_genetree.nhx test-data/42_genetree.nhx test-data/43_genetree.nhx test-data/44_genetree.nhx
diffstat 6 files changed, 122 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/ete_genetree_splitter.py	Thu Mar 10 14:01:44 2022 +0000
+++ b/ete_genetree_splitter.py	Tue Jun 07 08:58:05 2022 +0000
@@ -1,6 +1,8 @@
 from __future__ import print_function
 
 import optparse
+import os
+import sys
 
 from ete3 import PhyloTree
 
@@ -10,16 +12,26 @@
     parser = optparse.OptionParser(usage=usage)
     parser.add_option('--genetree', help='GeneTree in nhx format')
     parser.add_option('--speciestree', help='Species Tree in nhx format')
+    parser.add_option('--ingroup', help='Species Tree in nhx format')
+    parser.add_option('--outgroup', help='Species Tree in nhx format')
     parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)')
     parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene')
     parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose')
-    parser.add_option('--split', type='choice', choices=['dups', 'treeko'], dest="split", default='dups', help='Choose GeneTree splitting algorithms')
+    parser.add_option('--split', type='choice', choices=['dups', 'treeko', 'species'], dest="split", default='dups', help='Choose GeneTree splitting algorithms')
     parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)')
+    parser.add_option('-d', '--dir', type='string', default="", help="Absolute or relative path to output directory. If directory does not exist it will be created")
+
     options, args = parser.parse_args()
 
+    if options.dir and not os.path.exists(options.dir):
+        os.makedirs(options.dir)
+
     if options.genetree is None:
         parser.error("--genetree option must be specified, GeneTree in nhx format")
 
+    if os.stat(options.genetree).st_size == 0:
+        sys.exit()
+
     with open(options.genetree, 'r') as f:
         contents = f.read()
 
@@ -35,7 +47,6 @@
 
     # reconcile species tree with gene tree to help find out gene gain/lose
     if options.gainlose:
-
         if options.speciestree is None:
             parser.error("--speciestree option must be specified, species tree in nhx format")
 
@@ -50,24 +61,86 @@
 
     if options.split == "dups":
         # splits tree by duplication events which returns the list of all subtrees resulting from splitting current tree by its duplication nodes.
-        for cluster_id, node in enumerate(genetree.split_by_dups(), 1):
-            outfile = str(cluster_id) + '_genetree.nhx'
+        for cluster_id, node in enumerate(genetree.split_by_dups(), start=1):
+            outfile = '{}_genetree.nhx'.format(cluster_id)
+            if options.dir:
+                outfile = os.path.join(options.dir, outfile)
             with open(outfile, 'w') as f:
                 f.write(node.write(format=options.output_format))
     elif options.split == "treeko":
         # splits tree using the TreeKO algorithm.
         ntrees, ndups, sptrees = genetree.get_speciation_trees()
 
-        cluster_id = 0
-        for spt in sptrees:
-            cluster_id = cluster_id + 1
-            outfile = str(cluster_id) + '_genetree.nhx'
+        for cluster_id, spt in enumerate(sptrees, start=1):
+            outfile = '{}_genetree.nhx'.format(cluster_id)
+            if options.dir:
+                outfile = os.path.join(options.dir, outfile)
             with open(outfile, 'w') as f:
                 f.write(spt.write(format=options.output_format))
+    elif options.split == "species":
+        ingroup = options.ingroup.split(",")
+        outgroup = options.outgroup.split(",")
+        cluster_id = 0
+
+        def split_tree_by_species(tree, ingroup, outgroup):
+            nonlocal cluster_id
+
+            if len(outgroup) > 0:
+                outgroup_bool = check_outgroup(tree, outgroup)
+            else:
+                outgroup_bool = True
+
+            if outgroup_bool and check_ingroup(tree, ingroup):
+                child1, child2 = tree.children
+                split_tree_by_species(child1, ingroup, outgroup)
+                split_tree_by_species(child2, ingroup, outgroup)
+            else:
+                cluster_id += 1
+                outfile = '{}_genetree.nhx'.format(cluster_id)
+                if options.dir:
+                    outfile = os.path.join(options.dir, outfile)
+                with open(outfile, 'w') as f:
+                    f.write(tree.write(format=options.output_format))
+
+        split_tree_by_species(genetree, ingroup, outgroup)
+
+
+def check_outgroup(tree, outgroup):
+    species = get_species(tree)
+
+    count = 0
+
+    for out in outgroup:
+        if species.count(out) > 1:
+            count = count + 1
+
+    return count >= len(outgroup) / 2
+
+
+def check_ingroup(tree, ingroup):
+    species = get_species(tree)
+
+    count = 0
+
+    for ing in ingroup:
+        if species.count(ing) > 1:
+            count = count + 1
+
+    return count > 0 and len(ingroup) / count >= 0.8
 
 
 def parse_sp_name(node_name):
-    return node_name.split("_")[1]
+    return node_name.split("_")[-1]
+
+
+def get_species(node):
+    leaves_list = node.get_leaf_names()
+    # Genetree nodes are required to be in gene_species format
+    leaves_list = [_ for _ in leaves_list if '_' in _]
+
+    species_list = [_.split("_")[-1] for _ in leaves_list]
+
+    return species_list
 
 
 if __name__ == "__main__":
--- a/ete_genetree_splitter.xml	Thu Mar 10 14:01:44 2022 +0000
+++ b/ete_genetree_splitter.xml	Tue Jun 07 08:58:05 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="ete_genetree_splitter" name="ETE GeneTree splitter" version="@VERSION@">
+<tool id="ete_genetree_splitter" name="ETE GeneTree splitter" version="@VERSION@+galaxy1">
     <description>from a genetree using the ETE Toolkit</description>
     <macros>
         <import>ete_macros.xml</import>
@@ -12,24 +12,37 @@
 python '$__tool_directory__/ete_genetree_splitter.py'
 --genetree '$genetreeFile'
 --gene_node $gene_node
---split $splitter
+--split $splitting_conditional.splitter
 #if $gainlose_conditional.gainlose == "True"
     --speciestree '$gainlose_conditional.speciesFile'
     --species_format $gainlose_conditional.species_format
     --gainlose
 #end if
+#if $splitting_conditional.splitter == "species"
+    --ingroup '$splitting_conditional.ingroup'
+    --outgroup '$splitting_conditional.outgroup'
+#end if
 --output_format $output_format
     ]]></command>
     <inputs>
         <param name="genetreeFile" type="data" format="nhx" label="GeneTree file" help="GeneTree in nhx format" />
-         <param name="gene_node" type="select" label="Select Gene node format" help="Select Gene node format from one of the option">
+        <param name="gene_node" type="select" label="Select Gene node format" help="Select Gene node format from one of the option">
             <option value="0" selected="true">gene_species</option>
             <option value="1">species_gene</option>
         </param>
-        <param name="splitter" type="select" label="GeneTree splitting algorithm">
-            <option value="dups">Split by Duplication</option>
-            <option value="treeko">Split using TreeKO algorithm</option>
-        </param>
+        <conditional name="splitting_conditional">
+            <param name="splitter" type="select" label="GeneTree splitting algorithm">
+                <option value="dups">Split by Duplication</option>
+                <option value="treeko">Split using TreeKO algorithm</option>
+                <option value="species">Split using In-group and Out-group species</option>
+            </param>
+            <when value="dups" />
+            <when value="treeko" />
+            <when value="species">
+                <param name="ingroup" type="text" label="In-group species list" help="Comma-separated species list without whitespaces or special characters" />
+                <param name="outgroup" type="text" label="Out-group species list" help="Comma-separated species list without whitespaces or special characters" />
+            </when>
+        </conditional>
         <conditional name="gainlose_conditional">
             <param name="gainlose" type="select" label="Find out gene gain/lose">
                 <option value="True">Yes</option>
@@ -107,13 +120,28 @@
             <param name="gene_node" value="0"/>
             <param name="splitter" value="treeko"/>
             <param name="output_format" value="9" />
-            <output_collection name="genetrees_lists" type="list" count="5">
+            <output_collection name="genetrees_lists" type="list" count="4">
                 <element name="1" file="31_genetree.nhx" ftype="nhx" />
                 <element name="2" file="32_genetree.nhx" ftype="nhx" />
                 <element name="3" file="33_genetree.nhx" ftype="nhx" />
                 <element name="4" file="34_genetree.nhx" ftype="nhx" />
             </output_collection>
         </test>
+
+        <test>
+            <param name="genetreeFile" ftype="nhx" value="genetree.nhx" />
+            <param name="gene_node" value="0"/>
+            <param name="splitter" value="species"/>
+            <param name="ingroup" value="pantroglodytes,homosapiens,rattusnorvegicus,musmusculus,canisfamiliaris" />
+            <param name="outgroup" value="susscrofa" />
+            <param name="output_format" value="9" />
+            <output_collection name="genetrees_lists" type="list" count="4">
+                <element name="1" file="41_genetree.nhx" ftype="nhx" />
+                <element name="2" file="42_genetree.nhx" ftype="nhx" />
+                <element name="3" file="43_genetree.nhx" ftype="nhx" />
+                <element name="4" file="44_genetree.nhx" ftype="nhx" />
+            </output_collection>
+        </test>
     </tests>
     <help><![CDATA[
 Split GeneTrees from single GeneTree by duplication event using the `ETE Toolkit`_.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/41_genetree.nhx	Tue Jun 07 08:58:05 2022 +0000
@@ -0,0 +1,1 @@
+(((insr_rattusnorvegicus,insr_musmusculus),(insr_homosapiens,insr_pantroglodytes)),insr_susscrofa);
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/42_genetree.nhx	Tue Jun 07 08:58:05 2022 +0000
@@ -0,0 +1,1 @@
+((maob_rattusnorvegicus,maob_musmusculus),((maob_homosapiens,maob_pantroglodytes),(maob_susscrofa,maob_canisfamiliaris)));
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/43_genetree.nhx	Tue Jun 07 08:58:05 2022 +0000
@@ -0,0 +1,1 @@
+(((maoa_rattusnorvegicus,maoa_musmusculus),(maoa_homosapiens,maoa_pantroglodytes)),(maoa_susscrofa,maoa_canisfamiliaris));
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/44_genetree.nhx	Tue Jun 07 08:58:05 2022 +0000
@@ -0,0 +1,1 @@
+(((brat1_rattusnorvegicus,brat1_musmusculus),(brat1_homosapiens,brat1_pantroglodytes)),(brat1_susscrofa,brat1_canisfamiliaris));
\ No newline at end of file