Mercurial > repos > miller-lab > genome_diversity
annotate pca.py @ 25:cba0d7a63b82
workaround for gd_genotype datatype
admix shift int -> float
| author | Richard Burhans <burhans@bx.psu.edu> |
|---|---|
| date | Wed, 29 May 2013 13:49:19 -0400 |
| parents | 248b06e86022 |
| children | 8997f2ca8c7a |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 | |
| 3 import errno | |
| 4 import os | |
| 5 import shutil | |
| 6 import subprocess | |
| 7 import sys | |
| 8 from BeautifulSoup import BeautifulSoup | |
| 9 import gd_composite | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
10 import re |
| 0 | 11 |
| 12 ################################################################################ | |
| 13 | |
| 14 def mkdir_p(path): | |
| 15 try: | |
| 16 os.makedirs(path) | |
| 17 except OSError, e: | |
| 18 if e.errno <> errno.EEXIST: | |
| 19 raise | |
| 20 | |
| 21 ################################################################################ | |
| 22 | |
| 23 def run_program(prog, args, stdout_file=None): | |
| 24 #print "args: ", ' '.join(args) | |
| 25 p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
| 26 (stdoutdata, stderrdata) = p.communicate() | |
| 27 rc = p.returncode | |
| 28 | |
| 29 if stdout_file is not None: | |
| 30 with open(stdout_file, 'w') as ofh: | |
| 31 print >> ofh, stdoutdata | |
| 32 | |
| 33 if rc != 0: | |
| 34 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 35 print >> sys.stderr, stderrdata | |
| 36 sys.exit(1) | |
| 37 | |
| 38 ################################################################################ | |
| 39 | |
| 40 def do_ped2geno(input, output): | |
| 41 lines = [] | |
| 42 with open(input) as fh: | |
| 43 for line in fh: | |
| 44 line = line.rstrip('\r\n') | |
| 45 lines.append(line.split()) | |
| 46 | |
| 47 pair_map = { | |
| 48 '0':{ '0':'9', '1':'9', '2':'9' }, | |
| 49 '1':{ '0':'1', '1':'2', '2':'1' }, | |
| 50 '2':{ '0':'1', '1':'1', '2':'0' } | |
| 51 } | |
| 52 with open(output, 'w') as ofh: | |
| 53 for a_idx in xrange(6, len(lines[0]), 2): | |
| 54 b_idx = a_idx + 1 | |
| 55 print >> ofh, ''.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines)) | |
| 56 | |
| 57 def do_map2snp(input, output): | |
| 58 with open(output, 'w') as ofh: | |
| 59 with open(input) as fh: | |
| 60 for line in fh: | |
| 61 elems = line.split() | |
| 62 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) | |
| 63 | |
| 64 def make_ind_file(ind_file, input): | |
| 65 pops = [] | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
66 name_map = [] |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
67 name_idx = 0 |
| 0 | 68 |
| 69 ofh = open(ind_file, 'w') | |
| 70 | |
| 71 with open(input) as fh: | |
| 72 soup = BeautifulSoup(fh) | |
| 73 misc = soup.find('div', {'id': 'gd_misc'}) | |
| 74 populations = misc('ul')[0] | |
| 75 | |
| 76 i = 0 | |
| 77 for entry in populations: | |
| 78 if i % 2 == 1: | |
| 79 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') | |
| 80 pops.append(population_name) | |
| 81 individuals = entry.ol('li') | |
| 82 for individual in individuals: | |
| 83 individual_name = individual.string.encode('utf8').strip() | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
84 name_map.append(individual_name) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
85 print >> ofh, 'ind_%s' % name_idx, 'M', population_name |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
86 name_idx += 1 |
| 0 | 87 i += 1 |
| 88 | |
| 89 ofh.close() | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
90 return pops, name_map |
| 0 | 91 |
| 92 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): | |
| 93 with open(par_file, 'w') as fh: | |
| 94 print >> fh, 'genotypename: {0}'.format(geno_file) | |
| 95 print >> fh, 'snpname: {0}'.format(snp_file) | |
| 96 print >> fh, 'indivname: {0}'.format(ind_file) | |
| 97 print >> fh, 'evecoutname: {0}'.format(evec_file) | |
| 98 print >> fh, 'evaloutname: {0}'.format(eval_file) | |
| 99 print >> fh, 'altnormstyle: NO' | |
| 100 print >> fh, 'numoutevec: 2' | |
| 101 | |
| 102 def do_smartpca(par_file): | |
| 103 prog = 'smartpca' | |
| 104 | |
| 105 args = [ prog ] | |
| 106 args.append('-p') | |
| 107 args.append(par_file) | |
| 108 | |
| 109 #print "args: ", ' '.join(args) | |
| 110 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr) | |
| 111 (stdoutdata, stderrdata) = p.communicate() | |
| 112 rc = p.returncode | |
| 113 | |
| 114 if rc != 0: | |
| 115 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 116 print >> sys.stderr, stderrdata | |
| 117 sys.exit(1) | |
| 118 | |
| 119 stats = [] | |
| 120 | |
| 121 save_line = False | |
| 122 for line in stdoutdata.split('\n'): | |
| 123 if line.startswith(('## Average divergence', '## Anova statistics', '## Statistical significance')): | |
| 124 stats.append('') | |
| 125 save_line = True | |
| 126 if line.strip() == '': | |
| 127 save_line = False | |
| 128 if save_line: | |
| 129 stats.append(line) | |
| 130 | |
| 131 return '\n'.join(stats[1:]) | |
| 132 | |
| 133 def do_ploteig(evec_file, population_names): | |
| 134 prog = 'gd_ploteig' | |
| 135 | |
| 136 args = [ prog ] | |
| 137 args.append('-i') | |
| 138 args.append(evec_file) | |
| 139 args.append('-c') | |
| 140 args.append('1:2') | |
| 141 args.append('-p') | |
| 142 args.append(':'.join(population_names)) | |
| 143 args.append('-x') | |
| 144 | |
| 145 run_program(None, args) | |
| 146 | |
| 147 def do_eval2pct(eval_file, explained_file): | |
| 148 prog = 'eval2pct' | |
| 149 | |
| 150 args = [ prog ] | |
| 151 args.append(eval_file) | |
| 152 | |
| 153 with open(explained_file, 'w') as ofh: | |
| 154 #print "args:", ' '.join(args) | |
| 155 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
| 156 (stdoutdata, stderrdata) = p.communicate() | |
| 157 rc = p.returncode | |
| 158 | |
| 159 if rc != 0: | |
| 160 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 161 print >> sys.stderr, stderrdata | |
| 162 sys.exit(1) | |
| 163 | |
| 164 def do_coords2admix(coords_file): | |
| 165 prog = 'coords2admix' | |
| 166 | |
| 167 args = [ prog ] | |
| 168 args.append(coords_file) | |
| 169 | |
| 170 with open('fake', 'w') as ofh: | |
| 171 #print "args:", ' '.join(args) | |
| 172 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
| 173 (stdoutdata, stderrdata) = p.communicate() | |
| 174 rc = p.returncode | |
| 175 | |
| 176 if rc != 0: | |
| 177 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 178 print >> sys.stderr, stderrdata | |
| 179 sys.exit(1) | |
| 180 | |
| 181 shutil.copy2('fake', coords_file) | |
| 182 | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
183 ind_regex = re.compile('ind_([0-9]+)') |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
184 |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
185 def fix_names(name_map, files): |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
186 for file in files: |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
187 tmp_filename = '%s.tmp' % file |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
188 with open(tmp_filename, 'w') as ofh: |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
189 with open(file) as fh: |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
190 for line in fh: |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
191 line = line.rstrip('\r\n') |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
192 match = ind_regex.search(line) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
193 if match: |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
194 idx = int(match.group(1)) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
195 old = 'ind_%s' % idx |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
196 new = name_map[idx].replace(' ', '_') |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
197 line = line.replace(old, new) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
198 print >> ofh, line |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
199 |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
200 shutil.copy2(tmp_filename, file) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
201 os.unlink(tmp_filename) |
|
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
202 |
| 0 | 203 ################################################################################ |
| 204 | |
| 205 if len(sys.argv) != 5: | |
| 206 print "usage" | |
| 207 sys.exit(1) | |
| 208 | |
| 209 input, input_files_path, output, output_files_path = sys.argv[1:5] | |
| 210 | |
| 211 mkdir_p(output_files_path) | |
| 212 | |
| 213 ped_file = os.path.join(input_files_path, 'admix.ped') | |
| 214 geno_file = os.path.join(output_files_path, 'admix.geno') | |
| 215 do_ped2geno(ped_file, geno_file) | |
| 216 | |
| 217 map_file = os.path.join(input_files_path, 'admix.map') | |
| 218 snp_file = os.path.join(output_files_path, 'admix.snp') | |
| 219 do_map2snp(map_file, snp_file) | |
| 220 | |
| 221 ind_file = os.path.join(output_files_path, 'admix.ind') | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
222 population_names, name_map = make_ind_file(ind_file, input) |
| 0 | 223 |
| 224 par_file = os.path.join(output_files_path, 'par.admix') | |
| 225 evec_file = os.path.join(output_files_path, 'coordinates.txt') | |
| 226 eval_file = os.path.join(output_files_path, 'admix.eval') | |
| 227 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) | |
| 228 | |
| 229 smartpca_stats = do_smartpca(par_file) | |
|
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
230 fix_names(name_map, [ind_file, evec_file]) |
| 0 | 231 |
| 232 do_ploteig(evec_file, population_names) | |
| 233 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) | |
| 234 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') | |
| 235 shutil.copy2(plot_file, output_plot_file) | |
| 236 os.unlink(plot_file) | |
| 237 | |
| 238 do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt')) | |
| 239 os.unlink(eval_file) | |
| 240 | |
| 241 do_coords2admix(evec_file) | |
| 242 | |
| 243 ################################################################################ | |
| 244 | |
| 245 info_page = gd_composite.InfoPage() | |
| 246 info_page.set_title('PCA Galaxy Composite Dataset') | |
| 247 | |
| 248 display_file = gd_composite.DisplayFile() | |
| 249 display_value = gd_composite.DisplayValue() | |
| 250 | |
| 251 out_pdf = gd_composite.Parameter(name='PCA.pdf', value='PCA.pdf', display_type=display_file) | |
| 252 out_evec = gd_composite.Parameter(name='coordinates.txt', value='coordinates.txt', display_type=display_file) | |
| 253 out_explained = gd_composite.Parameter(name='explained.txt', value='explained.txt', display_type=display_file) | |
| 254 | |
| 255 evec_prefix = 'coordinates.txt.1:2.{0}'.format(':'.join(population_names)) | |
| 256 ps_file = '{0}.ps'.format(evec_prefix) | |
| 257 xtxt_file = '{0}.xtxt'.format(evec_prefix) | |
| 258 | |
| 259 os.unlink(os.path.join(output_files_path, ps_file)) | |
| 260 os.unlink(os.path.join(output_files_path, xtxt_file)) | |
| 261 | |
| 262 info_page.add_output_parameter(out_pdf) | |
| 263 info_page.add_output_parameter(out_evec) | |
| 264 info_page.add_output_parameter(out_explained) | |
| 265 | |
| 266 in_admix = gd_composite.Parameter(name='par.admix', value='par.admix', display_type=display_file) | |
| 267 in_geno = gd_composite.Parameter(name='admix.geno', value='admix.geno', display_type=display_file) | |
| 268 in_snp = gd_composite.Parameter(name='admix.snp', value='admix.snp', display_type=display_file) | |
| 269 in_ind = gd_composite.Parameter(name='admix.ind', value='admix.ind', display_type=display_file) | |
| 270 | |
| 271 info_page.add_input_parameter(in_admix) | |
| 272 info_page.add_input_parameter(in_geno) | |
| 273 info_page.add_input_parameter(in_snp) | |
| 274 info_page.add_input_parameter(in_ind) | |
| 275 | |
| 276 misc_stats = gd_composite.Parameter(description='Stats<p/><pre>\n{0}\n</pre>'.format(smartpca_stats), display_type=display_value) | |
| 277 | |
| 278 info_page.add_misc(misc_stats) | |
| 279 | |
| 280 with open (output, 'w') as ofh: | |
| 281 print >> ofh, info_page.render() | |
| 282 | |
| 283 sys.exit(0) | |
| 284 |
