Mercurial > repos > earlhaminst > ete
changeset 8:16e925bf567e draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/ete commit ed32f2e6d8174873cefcbe141084f857f84b0586"
author | earlhaminst |
---|---|
date | Thu, 31 Oct 2019 07:48:59 -0400 |
parents | 6a5282f71f82 |
children | b29ee6a16524 |
files | ete_gene_cnv.py ete_gene_cnv.xml ete_homology_classifier.xml test-data/test.nhx test-data/test.tsv |
diffstat | 5 files changed, 116 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
# ete_gene_cnv.py -- Gene Copy Number Finder (file added by this changeset).
from __future__ import print_function

import argparse
import collections


def _format_cell(row, col):
    """Return row[col] as text, using '' for None or for a missing key."""
    try:
        value = row[col]
    except KeyError:
        # Plain dicts raise on a missing column; Counter rows never get here
        # because they report 0 for keys they do not contain.
        return ''
    return '' if value is None else str(value)


def printTSV(myDict, colList=None):
    """Print a list of dictionaries (myDict) as tab-separated rows on stdout.

    The first row printed is the header (colList). If colList is not given,
    the keys of the first dictionary are used, in that dictionary's own
    iteration order. A value of None, or a column missing from a plain
    dictionary, is printed as an empty field; collections.Counter rows print
    0 for columns they do not contain.

    Author: Thierry Husson - Use it as you want but don't blame me.
    """
    if not colList:
        colList = list(myDict[0].keys() if myDict else [])

    print(*colList, sep="\t")
    for item in myDict:
        print(*[_format_cell(item, col) for col in colList], sep="\t")


def _count_copies(leaves, species_list, tree_text):
    """Return a Counter of species -> number of leaves, plus a 'gene' entry.

    leaves: leaf names, each expected to be in gene_species format.
    species_list: species in priority order; the first one present among the
        leaves is the reference species, and the gene of its first leaf is
        stored under the 'gene' key.
    tree_text: the tree as read from the input, used only in error messages.

    Raises ValueError if a leaf name does not split into exactly two parts on
    '_', or if no species of species_list appears among the leaves.
    """
    leaves_parts = [leaf.split("_") for leaf in leaves]
    for leaf, parts in zip(leaves, leaves_parts):
        if len(parts) != 2:
            raise ValueError("Leaf node '%s' is not in gene_species format" % leaf)

    species_counter = collections.Counter(parts[1] for parts in leaves_parts)

    # The reference species is the first element of species_list which
    # appears in a leaf node.
    ref_species = next((s for s in species_list if s in species_counter), None)
    if ref_species is None:
        raise ValueError("None of the specified species was found in the GeneTree '%s'" % tree_text)

    # Label the row with the gene of the (first) leaf of the reference species.
    species_counter['gene'] = next(
        parts[0] for parts in leaves_parts if parts[1] == ref_species)
    return species_counter


def main():
    """Read GeneTrees (one per line) and print per-species gene copy numbers.

    Command-line arguments:
      --genetree      path to a file with one GeneTree in nhx format per line
      --speciesorder  comma-separated species list; it fixes the column order
                      of the output and the choice of the reference species

    Writes a tab-separated table to stdout: a header row ("gene" followed by
    the species), then one row per GeneTree.
    """
    # Imported here rather than at module level so that printTSV can be
    # imported and reused without ete3 being installed.
    from ete3 import PhyloTree

    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [species.strip() for species in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # Each non-empty line holds one gene tree.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not trees; skip them instead of failing to parse.
                continue

            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')

            genetree = PhyloTree(line)
            table.append(_count_copies(genetree.get_leaf_names(), species_list, line))

    printTSV(table, ["gene"] + species_list)


if __name__ == "__main__":
    main()
<!-- ete_gene_cnv.xml: Galaxy tool wrapper for ete_gene_cnv.py (file added by this changeset). -->
<!-- NOTE(review): the tool id says "csv" while the script and file names say "cnv"; this looks like
     a typo, but the id is left unchanged because it is the identifier the tool is known by. -->
<tool id="ete_gene_csv_finder" name="Gene Copy Number Finder" version="@VERSION@">
    <description>from a genetree using the ETE Toolkit</description>
    <macros>
        <import>ete_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/ete_gene_cnv.py'
--genetree '$genetreeFile'
--speciesorder '$speciesorder'
> '$genes'
    ]]></command>
    <inputs>
        <param name="genetreeFile" type="data" format="nhx" label="GeneTree file" help="GeneTree in nhx format, where nodes are in form of geneid_species" />
        <param name="speciesorder" type="text" label="Species in order" help="Comma-separated species list">
            <!-- An empty list can never match a leaf, so reject it in the form instead of failing in the script. -->
            <validator type="empty_field" />
        </param>
    </inputs>
    <outputs>
        <data name="genes" format="tabular" label="${tool.name} on ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <param name="genetreeFile" ftype="nhx" value="test.nhx" />
            <param name="speciesorder" value="w,x,y,z,zz" />
            <output name="genes" file="test.tsv" />
        </test>
    </tests>
    <help><![CDATA[
Find copy number for genes from GeneTree by utilising the `ETE Toolkit`_. Input can be a single GeneTree or multiple GeneTrees in a file with one GeneTree per line.

The output is a tab-separated table with one row per GeneTree: a ``gene`` column followed by one column per species, in the order given, holding the number of leaves of that species in the tree. The ``gene`` column contains the gene of the first listed species that is present in the tree.

.. _ETE Toolkit: http://etetoolkit.org/
    ]]></help>
    <expand macro="citations" />
</tool>
--- a/ete_homology_classifier.xml Thu Oct 11 11:52:28 2018 -0400 +++ b/ete_homology_classifier.xml Thu Oct 31 07:48:59 2019 -0400 @@ -1,5 +1,5 @@ <tool id="ete_homology_classifier" name="Homology Classifier and Filter" version="@VERSION@"> - <description>from a genetree utilising the ETE Toolkit</description> + <description>from a genetree using the ETE Toolkit</description> <macros> <import>ete_macros.xml</import> <xml name="homologies_macro" token_label="" token_help="">