Mercurial > repos > miller-lab > genome_diversity
comparison phylogenetic_tree.py @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 15 Jul 2013 10:47:35 -0400 |
parents | 248b06e86022 |
children |
comparison
equal
deleted
inserted
replaced
26:91e835060ad2 | 27:8997f2ca8c7a |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import gd_util | |
3 import os | 4 import os |
4 import errno | |
5 import sys | 5 import sys |
6 import subprocess | |
7 import shutil | |
8 from Population import Population | 6 from Population import Population |
9 import gd_composite | 7 import gd_composite |
10 | 8 |
11 ################################################################################ | 9 ################################################################################ |
12 | 10 |
13 def mkdir_p(path): | 11 if len(sys.argv) != 12: |
14 try: | 12 gd_util.die('Usage') |
15 os.makedirs(path) | |
16 except OSError, e: | |
17 if e.errno <> errno.EEXIST: | |
18 raise | |
19 | 13 |
20 ################################################################################ | 14 input, output, extra_files_path, input_type, data_source_arg, minimum_coverage, minimum_quality, p1_input, dbkey, draw_tree_options, ind_arg = sys.argv[1:] |
21 | |
22 # <command interpreter="python"> | |
23 # phylogenetic_tree.py "$input" "$output" "$output.files_path" | |
24 # | |
25 # #if $input_type.choice == '0' | |
26 # "gd_snp" | |
27 # #if $input_type.data_source.choice == '0' | |
28 # "sequence_coverage" | |
29 # "$input_type.data_source.minimum_coverage" | |
30 # "$input_type.data_source.minimum_quality" | |
31 # #else if $input_type.data_source.choice == '1' | |
32 # "estimated_genotype" | |
33 # #else if $input_type.choice == '1' | |
34 # "gd_genotype" | |
35 # #end if | |
36 # | |
37 # #if $individuals.choice == '0' | |
38 # "all_individuals" | |
39 # #else if $individuals.choice == '1' | |
40 # "$individuals.p1_input" | |
41 # #end if | |
42 # | |
43 # #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0') | |
44 # "none" | |
45 # #else | |
46 # "$input.metadata.dbkey" | |
47 # #end if | |
48 # | |
49 # #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style]) | |
50 # #if $draw_tree_options == '' | |
51 # "" | |
52 # #else | |
53 # "-$draw_tree_options" | |
54 # #end if | |
55 # | |
56 # #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) | |
57 # #set $arg = '%s:%s' % ($individual_col, $individual_name) | |
58 # "$arg" | |
59 # #end for | |
60 # </command> | |
61 | |
62 ################################################################################ | |
63 | |
64 # if len(sys.argv) < 11: | |
65 # print >> sys.stderr, "Usage" | |
66 # sys.exit(1) | |
67 # | |
68 # input, p1_input, output, extra_files_path, minimum_coverage, minimum_quality, dbkey, data_source, draw_tree_options = sys.argv[1:10] | |
69 # | |
70 # individual_metadata = sys.argv[10:] | |
71 # | |
72 # # note: TEST THIS | |
73 # if dbkey in ['', '?', 'None']: | |
74 # dbkey = 'none' | |
75 # | |
76 # p_total = Population() | |
77 # p_total.from_tag_list(individual_metadata) | |
78 | |
79 if len(sys.argv) < 5: | |
80 print >> sys.stderr, 'Usage' | |
81 sys.exit(1) | |
82 | |
83 input, output, extra_files_path, input_type = sys.argv[1:5] | |
84 args = sys.argv[5:] | |
85 | |
86 data_source = '1' | |
87 minimum_coverage = '0' | |
88 minimum_quality = '0' | |
89 | 15 |
90 if input_type == 'gd_snp': | 16 if input_type == 'gd_snp': |
91 data_source_arg = args.pop(0) | |
92 if data_source_arg == 'sequence_coverage': | 17 if data_source_arg == 'sequence_coverage': |
93 data_source = '0' | 18 data_source = 0 |
94 minimum_coverage = args.pop(0) | |
95 minimum_quality = args.pop(0) | |
96 elif data_source_arg == 'estimated_genotype': | 19 elif data_source_arg == 'estimated_genotype': |
97 pass | 20 data_source = 1 |
98 else: | 21 else: |
99 print >> sys.stderr, 'Unsupported data_source:', data_source_arg | 22 gd_util.die('Unsupported data_source: {0}'.format(data_source_arg)) |
100 sys.exit(1) | |
101 elif input_type == 'gd_genotype': | 23 elif input_type == 'gd_genotype': |
102 pass | 24 data_source = 1 |
25 minimum_coverage = 0 | |
26 minimum_quality = 0 | |
103 else: | 27 else: |
104 print >> sys.stderr, 'Unsupported input_type:', input_type | 28 gd_util.die('Unsupported input_type:: {0}'.format(input_type)) |
105 sys.exit(1) | |
106 | |
107 p1_input, dbkey, draw_tree_options = args[:3] | |
108 | 29 |
109 # note: TEST THIS | 30 # note: TEST THIS |
110 if dbkey in ['', '?', 'None']: | 31 if dbkey in ['', '?', 'None']: |
111 dbkey = 'none' | 32 dbkey = 'none' |
112 | 33 |
113 individual_metadata = args[3:] | 34 p_total = Population() |
35 p_total.from_wrapped_dict(ind_arg) | |
114 | 36 |
115 p_total = Population() | 37 if p1_input == "all_individuals": |
116 p_total.from_tag_list(individual_metadata) | 38 tags = p_total.tag_list() |
39 else: | |
40 p1 = Population() | |
41 p1.from_population_file(p1_input) | |
42 if not p_total.is_superset(p1): | |
43 gd_util.die('There is an individual in the population that is not in the SNP table') | |
44 tags = p1.tag_list() | |
117 | 45 |
118 ################################################################################ | 46 ################################################################################ |
119 | 47 |
120 mkdir_p(extra_files_path) | 48 gd_util.mkdir_p(extra_files_path) |
121 | |
122 ################################################################################ | |
123 | |
124 def run_program(prog, args, ofh): | |
125 #print "args: ", ' '.join(args) | |
126 p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
127 (stdoutdata, stderrdata) = p.communicate() | |
128 rc = p.returncode | |
129 ofh.close() | |
130 | |
131 if rc != 0: | |
132 #print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
133 print >> sys.stderr, stderrdata | |
134 sys.exit(1) | |
135 | |
136 ################################################################################ | |
137 | |
138 phylip_outfile = os.path.join(extra_files_path, 'distance_matrix.phylip') | 49 phylip_outfile = os.path.join(extra_files_path, 'distance_matrix.phylip') |
139 newick_outfile = os.path.join(extra_files_path, 'phylogenetic_tree.newick') | 50 newick_outfile = os.path.join(extra_files_path, 'phylogenetic_tree.newick') |
140 ps_outfile = 'tree.ps' | 51 ps_outfile = 'tree.ps' |
141 pdf_outfile = os.path.join(extra_files_path, 'tree.pdf') | 52 pdf_outfile = os.path.join(extra_files_path, 'tree.pdf') |
53 informative_snp_file = os.path.join(extra_files_path, 'informative_snps.txt') | |
54 mega_distance_matrix_file = os.path.join(extra_files_path, 'mega_distance_matrix.txt') | |
142 | 55 |
143 ################################################################################ | 56 ################################################################################ |
144 | 57 |
145 informative_snp_file = os.path.join(extra_files_path, 'informative_snps.txt') | |
146 mega_distance_matrix_file = os.path.join(extra_files_path, 'mega_distance_matrix.txt') | |
147 | |
148 prog = 'dist_mat' | 58 prog = 'dist_mat' |
149 | 59 |
150 args = [] | 60 args = [ prog ] |
151 args.append(prog) | |
152 args.append(input) | 61 args.append(input) |
153 args.append(minimum_coverage) | 62 args.append(minimum_coverage) |
154 args.append(minimum_quality) | 63 args.append(minimum_quality) |
155 args.append(dbkey) | 64 args.append(dbkey) |
156 args.append(data_source) | 65 args.append(data_source) |
157 args.append(informative_snp_file) | 66 args.append(informative_snp_file) |
158 args.append(mega_distance_matrix_file) | 67 args.append(mega_distance_matrix_file) |
159 | 68 |
160 if p1_input == "all_individuals": | |
161 tags = p_total.tag_list() | |
162 else: | |
163 p1 = Population() | |
164 p1.from_population_file(p1_input) | |
165 if not p_total.is_superset(p1): | |
166 print >> sys.stderr, 'There is an individual in the population that is not in the SNP table' | |
167 sys.exit(1) | |
168 tags = p1.tag_list() | |
169 | |
170 for tag in tags: | 69 for tag in tags: |
171 if input_type == 'gd_genotype': | 70 if input_type == 'gd_genotype': |
172 column, name = tag.split(':') | 71 column, name = tag.split(':') |
173 tag = '{0}:{1}'.format(int(column) - 2, name) | 72 tag = '{0}:{1}'.format(int(column) - 2, name) |
174 args.append(tag) | 73 args.append(tag) |
175 | 74 |
176 fh = open(phylip_outfile, 'w') | 75 with open(phylip_outfile, 'w') as fh: |
177 run_program(None, args, fh) | 76 gd_util.run_program(prog, args, stdout=fh) |
178 | 77 |
179 ################################################################################ | 78 ################################################################################ |
180 | 79 |
181 prog = 'quicktree' | 80 prog = 'quicktree' |
182 | 81 |
183 args = [] | 82 args = [ prog ] |
184 args.append(prog) | |
185 args.append('-in') | 83 args.append('-in') |
186 args.append('m') | 84 args.append('m') |
187 args.append('-out') | 85 args.append('-out') |
188 args.append('t') | 86 args.append('t') |
189 args.append(phylip_outfile) | 87 args.append(phylip_outfile) |
190 | 88 |
191 fh = open(newick_outfile, 'w') | 89 with open(newick_outfile, 'w') as fh: |
192 run_program(None, args, fh) | 90 gd_util.run_program(prog, args, stdout=fh) |
193 | 91 |
194 ################################################################################ | 92 ################################################################################ |
195 | 93 |
196 prog = 'draw_tree' | 94 prog = 'draw_tree' |
197 | 95 |
198 args = [] | 96 args = [ prog ] |
199 args.append(prog) | 97 |
200 if draw_tree_options: | 98 if draw_tree_options: |
201 args.append(draw_tree_options) | 99 args.append(draw_tree_options) |
100 | |
202 args.append(newick_outfile) | 101 args.append(newick_outfile) |
203 | 102 |
204 fh = open(ps_outfile, 'w') | 103 with open(ps_outfile, 'w') as fh: |
205 run_program(None, args, fh) | 104 gd_util.run_program(prog, args, stdout=fh) |
206 | 105 |
207 ################################################################################ | 106 ################################################################################ |
208 | 107 |
209 prog = 'ps2pdf' | 108 prog = 'ps2pdf' |
210 | 109 |
211 args = [] | 110 args = [ prog ] |
212 args.append(prog) | |
213 args.append('-dPDFSETTINGS=/prepress') | 111 args.append('-dPDFSETTINGS=/prepress') |
214 args.append(ps_outfile) | 112 args.append(ps_outfile) |
215 args.append('-') | 113 args.append(pdf_outfile) |
216 | 114 |
217 fh = open(pdf_outfile, 'w') | 115 gd_util.run_program(prog, args) |
218 run_program(None, args, fh) | |
219 | |
220 shutil.copyfile(pdf_outfile, output) | |
221 | 116 |
222 ################################################################################ | 117 ################################################################################ |
223 | 118 |
224 info_page = gd_composite.InfoPage() | 119 info_page = gd_composite.InfoPage() |
225 info_page.set_title('Phylogenetic tree Galaxy Composite Dataset') | 120 info_page.set_title('Phylogenetic tree Galaxy Composite Dataset') |
246 if dbkey != 'none': | 141 if dbkey != 'none': |
247 include_ref_value = 'yes' | 142 include_ref_value = 'yes' |
248 | 143 |
249 in_include_ref = gd_composite.Parameter(description='Include reference sequence', value=include_ref_value, display_type=display_value) | 144 in_include_ref = gd_composite.Parameter(description='Include reference sequence', value=include_ref_value, display_type=display_value) |
250 | 145 |
251 if data_source == '0': | 146 if data_source == 0: |
252 data_source_value = 'sequence coverage' | 147 data_source_value = 'sequence coverage' |
253 elif data_source == '1': | 148 elif data_source == 1: |
254 data_source_value = 'estimated genotype' | 149 data_source_value = 'estimated genotype' |
255 | 150 |
256 in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value) | 151 in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value) |
257 | 152 |
258 branch_type_value = 'square' | 153 branch_type_value = 'square' |