comparison phylogenies/phytab_raxml.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/env python
2
3 ##
4 ## This tool runs RAxML on each partition/gene in a PHYTAB file.
5 ## If N = # of nodes requested in job runner, then N RAxML jobs will run simultaneously. Make sure that the
6 ## number of processors ('ppn') in the job runner matches the 'numthreads' value set in command line arguement.
7 ##
8 ## Usage: ./phytab_raxml.py -i <phytabinput> -e <model> -f <modelfile> -T 4
9 ## example: ./phytab_raxml.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
10 ## or: ./phytab_raxml.py -i myphtab.txt -e None -f modelsforeachpartition.txt -T 4
11
12 import optparse
13 import os
14 import subprocess
15 import multiprocessing
16
17 RESULTS_DIR = 'results'
18 RESULTS_FILE = 'results.phy'
19 RAXML_PREFIX = 'RAxML_result.'
20
21 def unescape(string):
22 mapped_chars = {
23 '>': '__gt__',
24 '<': '__lt__',
25 "'": '__sq__',
26 '"': '__dq__',
27 '[': '__ob__',
28 ']': '__cb__',
29 '{': '__oc__',
30 '}': '__cc__',
31 '@': '__at__',
32 '\n': '__cn__',
33 '\r': '__cr__',
34 '\t': '__tc__',
35 '#': '__pd__'
36 }
37
38 for key, value in mapped_chars.iteritems():
39 string = string.replace(value, key)
40
41 return string
42
43
44 class Species:
45 def __init__(self, string):
46 lis = string.split('\t')
47 # print lis
48 self.species = lis[0]
49 self.gene = lis[1]
50 self.name = lis[2]
51 self.sequence = lis[3]
52
53 def toString(self):
54 return self.species + '\t' + self.sequence
55
56
57 class Gene:
58 def __init__(self, name):
59 self.name = name
60 self.count = 0
61 self.length = 0
62 self.species = []
63
64 def output(self):
65 file_name = self.name + ".phy"
66 location = RESULTS_DIR + os.sep + file_name
67 with open(location, 'w') as f:
68 f.write(str(self.count) + '\t' + str(self.length) + '\n')
69 for s in self.species:
70 f.write(s.toString())
71 return file_name
72
73 def add(self, species):
74 if species.name == "":
75 return
76 self.species.append(species)
77 self.count += 1
78 if self.length == 0:
79 self.length = len(species.sequence) - 1
80
81
82 def output_species(species):
83 file_name = species.gene + ".phy"
84 location = RESULTS_DIR + os.sep + file_name
85 with open(location, 'a') as f:
86 f.write(species.toString())
87 return file_name
88
89
90 def process_phytab(input):
91 files = set()
92 genes = dict()
93 with open(input) as f:
94 for line in f:
95 if len(line) < 4:
96 continue
97 species = Species(line)
98 if species.gene in genes:
99 genes[species.gene].add(species)
100 else:
101 gene = Gene(species.gene)
102 gene.add(species)
103 genes[gene.name] = gene
104 for k, gene in genes.iteritems():
105 files.add(gene.output())
106 return files
107
108
109 def runRaxml(list_of_files, evo, evoDict, NUMTHREADS):
110 count = 0
111 list_of_files = sorted(list_of_files)
112 for gene_file in list_of_files:
113 count+=1
114 if gene_file.split(".")[0] in evoDict:
115 newEvo = evoDict[gene_file.split(".")[0]]
116 else:
117 newEvo = evo
118 file_name = RESULTS_DIR + os.sep + gene_file
119 ## RAxML notes: ##
120 # to run parsimony trees:
121 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34'])
122 # to run likelihood trees:
123 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", cpu_count, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'])
124 # to run likelihood trees using starting tree:
125 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'e','-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre])
126
127 ## run trees as simultaneously as possible--but wait until last tree completes to resume script
128 raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
129 if count == len(list_of_files):
130 run = subprocess.Popen(raxml_cmd)
131 run.wait()
132 else:
133 run = subprocess.Popen(raxml_cmd)
134 run.communicate()[0]
135
136
137 def readEfile(efile):
138 evoDict = {}
139 with open(efile, "r") as f:
140 for line in f:
141 pair = line.split("\t")
142 evoDict[pair[0].strip()] = pair[1].strip()
143 return evoDict
144
145 def main():
146 usage = """%prog [options]
147 options (listed below) default to 'None' if omitted
148 """
149 parser = optparse.OptionParser(usage=usage)
150
151 parser.add_option(
152 '-i', '--in',
153 dest='input',
154 action='store',
155 type='string',
156 metavar="FILE",
157 help='Name of input data.')
158
159 parser.add_option(
160 '-e', '--evo',
161 dest='evo',
162 action='store',
163 type='string',
164 metavar="EVO",
165 help='Evolution model.')
166
167 parser.add_option(
168 '-f', '--evo-file',
169 dest='efile',
170 action='store',
171 type='string',
172 metavar="EVO_FILE",
173 help='Evolution model file. Format is gene_name [tab] evolution_model.')
174
175 parser.add_option(
176 '-T', '--numthreads',
177 dest='numthreads',
178 action='store',
179 type='int',
180 metavar="NUMT",
181 help='Specify number of threads.')
182
183 options, args = parser.parse_args()
184
185 os.mkdir(RESULTS_DIR)
186
187 list_of_species_files = process_phytab(unescape(options.input))
188
189 try:
190 evoDict = readEfile(unescape(options.efile))
191 except IOError:
192 print "No sequence model file provide...using", unescape(options.evo), "as the model"
193 evoDict = {}
194
195 runRaxml(list_of_species_files, unescape(options.evo), evoDict, str(options.numthreads))
196
197 result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)]
198 result = sorted(result)
199 with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
200 for file in result:
201 with open(file, "r") as r:
202 f.write(file[len(RAXML_PREFIX):] + '\t' + r.read())
203
204 if __name__ == '__main__':
205 main()
206