diff phylogenies/phytab_raxml.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phylogenies/phytab_raxml.py	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+##
+##  This tool runs RAxML on each partition/gene in a PHYTAB file.
+##  If N = # of nodes requested in job runner, then N RAxML jobs will run simultaneously.  Make sure that the
+##  number of processors ('ppn') in the job runner matches the 'numthreads' value set in command line arguement.
+## 
+##  Usage:   ./phytab_raxml.py -i <phytabinput> -e <model> -f <modelfile> -T 4
+##  example: ./phytab_raxml.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
+##  or:      ./phytab_raxml.py -i myphtab.txt -e None -f modelsforeachpartition.txt -T 4
+
+import optparse
+import os
+import subprocess
+import multiprocessing
+
+RESULTS_DIR = 'results'
+RESULTS_FILE = 'results.phy'
+RAXML_PREFIX = 'RAxML_result.'
+
+def unescape(string):
+    mapped_chars = {
+        '>': '__gt__',
+        '<': '__lt__',
+        "'": '__sq__',
+        '"': '__dq__',
+        '[': '__ob__',
+        ']': '__cb__',
+        '{': '__oc__',
+        '}': '__cc__',
+        '@': '__at__',
+        '\n': '__cn__',
+        '\r': '__cr__',
+        '\t': '__tc__',
+        '#': '__pd__'
+        }
+
+    for key, value in mapped_chars.iteritems():
+        string = string.replace(value, key)
+
+    return string
+
+
+class Species:
+    def __init__(self, string):
+        lis = string.split('\t')
+        # print lis
+        self.species = lis[0]
+        self.gene = lis[1]
+        self.name = lis[2]
+        self.sequence = lis[3]
+
+    def toString(self):
+        return self.species + '\t' + self.sequence
+
+
+class Gene:
+    def __init__(self, name):
+        self.name = name
+        self.count = 0
+        self.length = 0
+        self.species = []
+
+    def output(self):
+        file_name = self.name + ".phy"
+        location = RESULTS_DIR + os.sep + file_name
+        with open(location, 'w') as f:
+            f.write(str(self.count) + '\t' + str(self.length) + '\n')
+            for s in self.species:
+                f.write(s.toString())
+        return file_name
+
+    def add(self, species):
+        if species.name == "":
+            return
+        self.species.append(species)
+        self.count += 1
+        if self.length == 0:
+            self.length = len(species.sequence) - 1
+
+
+def output_species(species):
+    file_name = species.gene + ".phy"
+    location = RESULTS_DIR + os.sep + file_name
+    with open(location, 'a') as f:
+        f.write(species.toString())
+    return file_name
+
+
+def process_phytab(input):
+    files = set()
+    genes = dict()
+    with open(input) as f:
+        for line in f:
+            if len(line) < 4:
+                continue
+            species = Species(line)
+            if species.gene in genes:
+                genes[species.gene].add(species)
+            else:
+                gene = Gene(species.gene)
+                gene.add(species)
+                genes[gene.name] = gene
+    for k, gene in genes.iteritems():
+        files.add(gene.output())
+    return files
+
+
+def runRaxml(list_of_files, evo, evoDict, NUMTHREADS):
+  count = 0
+  list_of_files = sorted(list_of_files)
+  for gene_file in list_of_files:
+    count+=1            
+    if gene_file.split(".")[0] in evoDict:
+      newEvo = evoDict[gene_file.split(".")[0]]
+    else:
+      newEvo = evo
+    file_name = RESULTS_DIR + os.sep + gene_file
+## RAxML notes:  ##  
+# to run parsimony trees:
+#          popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m',  newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34'])
+# to run likelihood trees:
+#          popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", cpu_count, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'])
+# to run likelihood trees using starting tree:
+#    popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'e','-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre])   
+
+## run trees as simultaneously as possible--but wait until last tree completes to resume script
+    raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
+    if count == len(list_of_files):
+      run = subprocess.Popen(raxml_cmd)
+      run.wait()
+    else:
+      run = subprocess.Popen(raxml_cmd)
+      run.communicate()[0]
+
+
+def readEfile(efile):
+    evoDict = {}
+    with open(efile, "r") as f:
+        for line in f:
+            pair = line.split("\t")
+            evoDict[pair[0].strip()] = pair[1].strip()
+    return evoDict
+
+def main():
+    usage = """%prog [options]
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option(
+        '-i', '--in',
+        dest='input',
+        action='store',
+        type='string',
+        metavar="FILE",
+        help='Name of input data.')
+
+    parser.add_option(
+        '-e', '--evo',
+        dest='evo',
+        action='store',
+        type='string',
+        metavar="EVO",
+        help='Evolution model.')
+
+    parser.add_option(
+        '-f', '--evo-file',
+        dest='efile',
+        action='store',
+        type='string',
+        metavar="EVO_FILE",
+        help='Evolution model file. Format is gene_name [tab] evolution_model.')
+
+    parser.add_option(
+        '-T', '--numthreads',
+        dest='numthreads',
+        action='store',
+        type='int',
+        metavar="NUMT",
+        help='Specify number of threads.')
+        
+    options, args = parser.parse_args()
+
+    os.mkdir(RESULTS_DIR)
+
+    list_of_species_files = process_phytab(unescape(options.input))
+
+    try:
+        evoDict = readEfile(unescape(options.efile))
+    except IOError:
+        print "No sequence model file provide...using", unescape(options.evo), "as the model" 
+        evoDict = {}
+   
+    runRaxml(list_of_species_files, unescape(options.evo), evoDict, str(options.numthreads))
+    
+    result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)]
+    result = sorted(result)
+    with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
+        for file in result:
+            with open(file, "r") as r:
+                f.write(file[len(RAXML_PREFIX):] + '\t' + r.read())
+
+if __name__ == '__main__':
+    main()
+