Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff phylogenies/phytab_raxml.py @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/phytab_raxml.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +## +## This tool runs RAxML on each partition/gene in a PHYTAB file. +## If N = # of nodes requested in job runner, then N RAxML jobs will run simultaneously. Make sure that the +## number of processors ('ppn') in the job runner matches the 'numthreads' value set in command line arguement. +## +## Usage: ./phytab_raxml.py -i <phytabinput> -e <model> -f <modelfile> -T 4 +## example: ./phytab_raxml.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4 +## or: ./phytab_raxml.py -i myphtab.txt -e None -f modelsforeachpartition.txt -T 4 + +import optparse +import os +import subprocess +import multiprocessing + +RESULTS_DIR = 'results' +RESULTS_FILE = 'results.phy' +RAXML_PREFIX = 'RAxML_result.' + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +class Species: + def __init__(self, string): + lis = string.split('\t') + # print lis + self.species = lis[0] + self.gene = lis[1] + self.name = lis[2] + self.sequence = lis[3] + + def toString(self): + return self.species + '\t' + self.sequence + + +class Gene: + def __init__(self, name): + self.name = name + self.count = 0 + self.length = 0 + self.species = [] + + def output(self): + file_name = self.name + ".phy" + location = RESULTS_DIR + os.sep + file_name + with open(location, 'w') as f: + f.write(str(self.count) + '\t' + str(self.length) + '\n') + for s in self.species: + f.write(s.toString()) + return file_name + + def add(self, species): + if species.name == "": + return + self.species.append(species) + self.count += 1 + if self.length == 0: + self.length = len(species.sequence) - 1 + + +def output_species(species): + file_name = species.gene + ".phy" + location = RESULTS_DIR + os.sep + file_name + with open(location, 'a') as f: + f.write(species.toString()) + return file_name + + +def process_phytab(input): + files = set() + genes = dict() + with open(input) as f: + for line in f: + if len(line) < 4: + continue + species = Species(line) + if species.gene in genes: + genes[species.gene].add(species) + else: + gene = Gene(species.gene) + gene.add(species) + genes[gene.name] = gene + for k, gene in genes.iteritems(): + files.add(gene.output()) + return files + + +def runRaxml(list_of_files, evo, evoDict, NUMTHREADS): + count = 0 + list_of_files = sorted(list_of_files) + for gene_file in list_of_files: + count+=1 + if gene_file.split(".")[0] in evoDict: + newEvo = evoDict[gene_file.split(".")[0]] + else: + newEvo = evo + file_name = RESULTS_DIR + os.sep + gene_file +## RAxML notes: ## +# to run parsimony trees: +# popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34']) +# to run likelihood trees: +# popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", cpu_count, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']) +# to run likelihood trees using starting tree: +# popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'e','-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre]) + +## run trees as simultaneously as possible--but wait until last tree completes to resume script + raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'] + if count == len(list_of_files): + run = subprocess.Popen(raxml_cmd) + run.wait() + else: + run = subprocess.Popen(raxml_cmd) + run.communicate()[0] + + +def readEfile(efile): + evoDict = {} + with open(efile, "r") as f: + for line in f: + pair = line.split("\t") + evoDict[pair[0].strip()] = pair[1].strip() + return evoDict + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + parser.add_option( + '-e', '--evo', + dest='evo', + action='store', + type='string', + metavar="EVO", + help='Evolution model.') + + parser.add_option( + '-f', '--evo-file', + dest='efile', + action='store', + type='string', + metavar="EVO_FILE", + help='Evolution model file. Format is gene_name [tab] evolution_model.') + + parser.add_option( + '-T', '--numthreads', + dest='numthreads', + action='store', + type='int', + metavar="NUMT", + help='Specify number of threads.') + + options, args = parser.parse_args() + + os.mkdir(RESULTS_DIR) + + list_of_species_files = process_phytab(unescape(options.input)) + + try: + evoDict = readEfile(unescape(options.efile)) + except IOError: + print "No sequence model file provide...using", unescape(options.evo), "as the model" + evoDict = {} + + runRaxml(list_of_species_files, unescape(options.evo), evoDict, str(options.numthreads)) + + result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)] + result = sorted(result) + with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f: + for file in result: + with open(file, "r") as r: + f.write(file[len(RAXML_PREFIX):] + '\t' + r.read()) + +if __name__ == '__main__': + main() +