Mercurial > repos > miller-lab > genome_diversity
annotate pca.py @ 25:cba0d7a63b82
workaround for gd_genotype datatype
admix shift int -> float
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 29 May 2013 13:49:19 -0400 |
parents | 248b06e86022 |
children | 8997f2ca8c7a |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
3 import errno | |
4 import os | |
5 import shutil | |
6 import subprocess | |
7 import sys | |
8 from BeautifulSoup import BeautifulSoup | |
9 import gd_composite | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
10 import re |
0 | 11 |
12 ################################################################################ | |
13 | |
14 def mkdir_p(path): | |
15 try: | |
16 os.makedirs(path) | |
17 except OSError, e: | |
18 if e.errno <> errno.EEXIST: | |
19 raise | |
20 | |
21 ################################################################################ | |
22 | |
23 def run_program(prog, args, stdout_file=None): | |
24 #print "args: ", ' '.join(args) | |
25 p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
26 (stdoutdata, stderrdata) = p.communicate() | |
27 rc = p.returncode | |
28 | |
29 if stdout_file is not None: | |
30 with open(stdout_file, 'w') as ofh: | |
31 print >> ofh, stdoutdata | |
32 | |
33 if rc != 0: | |
34 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
35 print >> sys.stderr, stderrdata | |
36 sys.exit(1) | |
37 | |
38 ################################################################################ | |
39 | |
40 def do_ped2geno(input, output): | |
41 lines = [] | |
42 with open(input) as fh: | |
43 for line in fh: | |
44 line = line.rstrip('\r\n') | |
45 lines.append(line.split()) | |
46 | |
47 pair_map = { | |
48 '0':{ '0':'9', '1':'9', '2':'9' }, | |
49 '1':{ '0':'1', '1':'2', '2':'1' }, | |
50 '2':{ '0':'1', '1':'1', '2':'0' } | |
51 } | |
52 with open(output, 'w') as ofh: | |
53 for a_idx in xrange(6, len(lines[0]), 2): | |
54 b_idx = a_idx + 1 | |
55 print >> ofh, ''.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines)) | |
56 | |
57 def do_map2snp(input, output): | |
58 with open(output, 'w') as ofh: | |
59 with open(input) as fh: | |
60 for line in fh: | |
61 elems = line.split() | |
62 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) | |
63 | |
64 def make_ind_file(ind_file, input): | |
65 pops = [] | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
66 name_map = [] |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
67 name_idx = 0 |
0 | 68 |
69 ofh = open(ind_file, 'w') | |
70 | |
71 with open(input) as fh: | |
72 soup = BeautifulSoup(fh) | |
73 misc = soup.find('div', {'id': 'gd_misc'}) | |
74 populations = misc('ul')[0] | |
75 | |
76 i = 0 | |
77 for entry in populations: | |
78 if i % 2 == 1: | |
79 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') | |
80 pops.append(population_name) | |
81 individuals = entry.ol('li') | |
82 for individual in individuals: | |
83 individual_name = individual.string.encode('utf8').strip() | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
84 name_map.append(individual_name) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
85 print >> ofh, 'ind_%s' % name_idx, 'M', population_name |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
86 name_idx += 1 |
0 | 87 i += 1 |
88 | |
89 ofh.close() | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
90 return pops, name_map |
0 | 91 |
92 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): | |
93 with open(par_file, 'w') as fh: | |
94 print >> fh, 'genotypename: {0}'.format(geno_file) | |
95 print >> fh, 'snpname: {0}'.format(snp_file) | |
96 print >> fh, 'indivname: {0}'.format(ind_file) | |
97 print >> fh, 'evecoutname: {0}'.format(evec_file) | |
98 print >> fh, 'evaloutname: {0}'.format(eval_file) | |
99 print >> fh, 'altnormstyle: NO' | |
100 print >> fh, 'numoutevec: 2' | |
101 | |
102 def do_smartpca(par_file): | |
103 prog = 'smartpca' | |
104 | |
105 args = [ prog ] | |
106 args.append('-p') | |
107 args.append(par_file) | |
108 | |
109 #print "args: ", ' '.join(args) | |
110 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr) | |
111 (stdoutdata, stderrdata) = p.communicate() | |
112 rc = p.returncode | |
113 | |
114 if rc != 0: | |
115 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
116 print >> sys.stderr, stderrdata | |
117 sys.exit(1) | |
118 | |
119 stats = [] | |
120 | |
121 save_line = False | |
122 for line in stdoutdata.split('\n'): | |
123 if line.startswith(('## Average divergence', '## Anova statistics', '## Statistical significance')): | |
124 stats.append('') | |
125 save_line = True | |
126 if line.strip() == '': | |
127 save_line = False | |
128 if save_line: | |
129 stats.append(line) | |
130 | |
131 return '\n'.join(stats[1:]) | |
132 | |
133 def do_ploteig(evec_file, population_names): | |
134 prog = 'gd_ploteig' | |
135 | |
136 args = [ prog ] | |
137 args.append('-i') | |
138 args.append(evec_file) | |
139 args.append('-c') | |
140 args.append('1:2') | |
141 args.append('-p') | |
142 args.append(':'.join(population_names)) | |
143 args.append('-x') | |
144 | |
145 run_program(None, args) | |
146 | |
147 def do_eval2pct(eval_file, explained_file): | |
148 prog = 'eval2pct' | |
149 | |
150 args = [ prog ] | |
151 args.append(eval_file) | |
152 | |
153 with open(explained_file, 'w') as ofh: | |
154 #print "args:", ' '.join(args) | |
155 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
156 (stdoutdata, stderrdata) = p.communicate() | |
157 rc = p.returncode | |
158 | |
159 if rc != 0: | |
160 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
161 print >> sys.stderr, stderrdata | |
162 sys.exit(1) | |
163 | |
164 def do_coords2admix(coords_file): | |
165 prog = 'coords2admix' | |
166 | |
167 args = [ prog ] | |
168 args.append(coords_file) | |
169 | |
170 with open('fake', 'w') as ofh: | |
171 #print "args:", ' '.join(args) | |
172 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
173 (stdoutdata, stderrdata) = p.communicate() | |
174 rc = p.returncode | |
175 | |
176 if rc != 0: | |
177 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
178 print >> sys.stderr, stderrdata | |
179 sys.exit(1) | |
180 | |
181 shutil.copy2('fake', coords_file) | |
182 | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
183 ind_regex = re.compile('ind_([0-9]+)') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
184 |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
185 def fix_names(name_map, files): |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
186 for file in files: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
187 tmp_filename = '%s.tmp' % file |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
188 with open(tmp_filename, 'w') as ofh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
189 with open(file) as fh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
190 for line in fh: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
191 line = line.rstrip('\r\n') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
192 match = ind_regex.search(line) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
193 if match: |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
194 idx = int(match.group(1)) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
195 old = 'ind_%s' % idx |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
196 new = name_map[idx].replace(' ', '_') |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
197 line = line.replace(old, new) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
198 print >> ofh, line |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
199 |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
200 shutil.copy2(tmp_filename, file) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
201 os.unlink(tmp_filename) |
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
202 |
0 | 203 ################################################################################ |
204 | |
205 if len(sys.argv) != 5: | |
206 print "usage" | |
207 sys.exit(1) | |
208 | |
209 input, input_files_path, output, output_files_path = sys.argv[1:5] | |
210 | |
211 mkdir_p(output_files_path) | |
212 | |
213 ped_file = os.path.join(input_files_path, 'admix.ped') | |
214 geno_file = os.path.join(output_files_path, 'admix.geno') | |
215 do_ped2geno(ped_file, geno_file) | |
216 | |
217 map_file = os.path.join(input_files_path, 'admix.map') | |
218 snp_file = os.path.join(output_files_path, 'admix.snp') | |
219 do_map2snp(map_file, snp_file) | |
220 | |
221 ind_file = os.path.join(output_files_path, 'admix.ind') | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
222 population_names, name_map = make_ind_file(ind_file, input) |
0 | 223 |
224 par_file = os.path.join(output_files_path, 'par.admix') | |
225 evec_file = os.path.join(output_files_path, 'coordinates.txt') | |
226 eval_file = os.path.join(output_files_path, 'admix.eval') | |
227 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) | |
228 | |
229 smartpca_stats = do_smartpca(par_file) | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
230 fix_names(name_map, [ind_file, evec_file]) |
0 | 231 |
232 do_ploteig(evec_file, population_names) | |
233 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) | |
234 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') | |
235 shutil.copy2(plot_file, output_plot_file) | |
236 os.unlink(plot_file) | |
237 | |
238 do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt')) | |
239 os.unlink(eval_file) | |
240 | |
241 do_coords2admix(evec_file) | |
242 | |
243 ################################################################################ | |
244 | |
245 info_page = gd_composite.InfoPage() | |
246 info_page.set_title('PCA Galaxy Composite Dataset') | |
247 | |
248 display_file = gd_composite.DisplayFile() | |
249 display_value = gd_composite.DisplayValue() | |
250 | |
251 out_pdf = gd_composite.Parameter(name='PCA.pdf', value='PCA.pdf', display_type=display_file) | |
252 out_evec = gd_composite.Parameter(name='coordinates.txt', value='coordinates.txt', display_type=display_file) | |
253 out_explained = gd_composite.Parameter(name='explained.txt', value='explained.txt', display_type=display_file) | |
254 | |
255 evec_prefix = 'coordinates.txt.1:2.{0}'.format(':'.join(population_names)) | |
256 ps_file = '{0}.ps'.format(evec_prefix) | |
257 xtxt_file = '{0}.xtxt'.format(evec_prefix) | |
258 | |
259 os.unlink(os.path.join(output_files_path, ps_file)) | |
260 os.unlink(os.path.join(output_files_path, xtxt_file)) | |
261 | |
262 info_page.add_output_parameter(out_pdf) | |
263 info_page.add_output_parameter(out_evec) | |
264 info_page.add_output_parameter(out_explained) | |
265 | |
266 in_admix = gd_composite.Parameter(name='par.admix', value='par.admix', display_type=display_file) | |
267 in_geno = gd_composite.Parameter(name='admix.geno', value='admix.geno', display_type=display_file) | |
268 in_snp = gd_composite.Parameter(name='admix.snp', value='admix.snp', display_type=display_file) | |
269 in_ind = gd_composite.Parameter(name='admix.ind', value='admix.ind', display_type=display_file) | |
270 | |
271 info_page.add_input_parameter(in_admix) | |
272 info_page.add_input_parameter(in_geno) | |
273 info_page.add_input_parameter(in_snp) | |
274 info_page.add_input_parameter(in_ind) | |
275 | |
276 misc_stats = gd_composite.Parameter(description='Stats<p/><pre>\n{0}\n</pre>'.format(smartpca_stats), display_type=display_value) | |
277 | |
278 info_page.add_misc(misc_stats) | |
279 | |
280 with open (output, 'w') as ofh: | |
281 print >> ofh, info_page.render() | |
282 | |
283 sys.exit(0) | |
284 |