Mercurial > repos > earlhaminst > ete

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ete_gene_cnv.py	Thu Oct 31 07:48:59 2019 -0400
@@ -0,0 +1,75 @@
+from __future__ import print_function
+
+import argparse
+import collections
+
+from ete3 import PhyloTree
+
+
+def printTSV(myDict, colList=None):
+    """Print a list of dictionaries (myDict) to stdout as tab-separated rows.
+
+    The header row is colList or, when that is empty or None, the keys of the
+    first dictionary. A None value is written as an empty field.
+    Author: Thierry Husson - Use it as you want but don't blame me.
+    """
+    header = colList or list(myDict[0].keys() if myDict else [])
+    rows = [header]
+    rows.extend(
+        [str('' if rec[key] is None else rec[key]) for key in header]
+        for rec in myDict
+    )
+    for row in rows:
+        print(*row, sep="\t")
+
+
+def main():
+    """Print a TSV with one row per GeneTree: a gene ID of the reference
+    species and the number of leaves per species in --speciesorder."""
+    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
+    parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
+    parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
+    args = parser.parse_args()
+
+    species_list = [_.strip() for _ in args.speciesorder.split(",")]
+    table = []
+
+    with open(args.genetree, "r") as f:
+        # The file contains one GeneTree per line
+        for line in f:
+            # Remove empty NHX features that can be produced by TreeBest but break ete3
+            line = line.replace('[&&NHX]', '').strip()
+            if not line:
+                continue  # blank lines (e.g. at the end of the file) are not trees
+
+            genetree = PhyloTree(line)
+            leaves = genetree.get_leaf_names()
+
+            leaves_parts = [_.split("_") for _ in leaves]
+            for i, leaf_parts in enumerate(leaves_parts):
+                if len(leaf_parts) != 2:
+                    raise Exception("Leaf node '%s' is not in gene_species format" % leaves[i])
+
+            species_counter = collections.Counter(_[1] for _ in leaves_parts)
+
+            # Assign to ref_species the first element of species_list which
+            # appears in a leaf node
+            for ref_species in species_list:
+                if ref_species in species_counter:
+                    break
+            else:
+                raise Exception("None of the specified species was found in the GeneTree '%s'" % line)
+
+            # Find the gene of the (first) leaf node for the ref_species
+            for leaf_parts in leaves_parts:
+                if leaf_parts[1] == ref_species:
+                    species_counter['gene'] = leaf_parts[0]
+                    break
+
+            table.append(species_counter)
+
+    printTSV(table, ["gene"] + species_list)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ete_gene_cnv.xml	Thu Oct 31 07:48:59 2019 -0400
@@ -0,0 +1,33 @@
+<tool id="ete_gene_csv_finder" name="Gene Copy Number Finder" version="@VERSION@">
+    <description>from a genetree using the ETE Toolkit</description>
+    <macros>
+        <import>ete_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/ete_gene_cnv.py'
+--genetree '$genetreeFile'
+--speciesorder '$speciesorder'
+> '$genes'
+    ]]></command>
+    <inputs>
+        <param name="genetreeFile" type="data" format="nhx" label="GeneTree file" help="GeneTree in nhx format, where nodes are in form of geneid_species" />
+        <param name="speciesorder" type="text" label="Species in order" help="Comma-separated species list" />
+    </inputs>
+    <outputs>
+        <data name="genes" format="tabular" label="${tool.name} on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="genetreeFile" ftype="nhx" value="test.nhx" />
+            <param name="speciesorder" value="w,x,y,z,zz" />
+            <output name="genes" file="test.tsv" />
+        </test>
+    </tests>
+    <help><![CDATA[
+Find copy numbers of genes from a GeneTree using the `ETE Toolkit`_. The input can be a single GeneTree or multiple GeneTrees in a file, with one GeneTree per line.
+
+.. _ETE Toolkit: http://etetoolkit.org/
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- a/ete_homology_classifier.xml	Thu Oct 11 11:52:28 2018 -0400
+++ b/ete_homology_classifier.xml	Thu Oct 31 07:48:59 2019 -0400
@@ -1,5 +1,5 @@
 <tool id="ete_homology_classifier" name="Homology Classifier and Filter" version="@VERSION@">
-    <description>from a genetree utilising the ETE Toolkit</description>
+    <description>from a genetree using the ETE Toolkit</description>
     <macros>
         <import>ete_macros.xml</import>
         <xml name="homologies_macro" token_label="" token_help="">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.nhx	Thu Oct 31 07:48:59 2019 -0400
@@ -0,0 +1,3 @@
+(((a_w,a_x),(a_y,a_z)),a_zz);
+(((a_w,a_w),(a_y,a_z)),a_zz);
+(((a_w,a_x),(a_y,a_y)),a_zz);
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.tsv	Thu Oct 31 07:48:59 2019 -0400
@@ -0,0 +1,4 @@
+gene	w	x	y	z	zz
+a	1	1	1	1	1
+a	2	0	1	1	1
+a	1	1	2	0	1