comparison pca.py @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 2c498d40ecde
children 8997f2ca8c7a
comparison
equal deleted inserted replaced
23:66a183c44dd5 24:248b06e86022
5 import shutil 5 import shutil
6 import subprocess 6 import subprocess
7 import sys 7 import sys
8 from BeautifulSoup import BeautifulSoup 8 from BeautifulSoup import BeautifulSoup
9 import gd_composite 9 import gd_composite
10 import re
10 11
11 ################################################################################ 12 ################################################################################
12 13
13 def mkdir_p(path): 14 def mkdir_p(path):
14 try: 15 try:
60 elems = line.split() 61 elems = line.split()
61 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) 62 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1])
62 63
63 def make_ind_file(ind_file, input): 64 def make_ind_file(ind_file, input):
64 pops = [] 65 pops = []
66 name_map = []
67 name_idx = 0
65 68
66 ofh = open(ind_file, 'w') 69 ofh = open(ind_file, 'w')
67 70
68 with open(input) as fh: 71 with open(input) as fh:
69 soup = BeautifulSoup(fh) 72 soup = BeautifulSoup(fh)
76 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') 79 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_')
77 pops.append(population_name) 80 pops.append(population_name)
78 individuals = entry.ol('li') 81 individuals = entry.ol('li')
79 for individual in individuals: 82 for individual in individuals:
80 individual_name = individual.string.encode('utf8').strip() 83 individual_name = individual.string.encode('utf8').strip()
81 print >> ofh, individual_name, 'M', population_name 84 name_map.append(individual_name)
85 print >> ofh, 'ind_%s' % name_idx, 'M', population_name
86 name_idx += 1
82 i += 1 87 i += 1
83 88
84 ofh.close() 89 ofh.close()
85 return pops 90 return pops, name_map
86 91
87 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): 92 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file):
88 with open(par_file, 'w') as fh: 93 with open(par_file, 'w') as fh:
89 print >> fh, 'genotypename: {0}'.format(geno_file) 94 print >> fh, 'genotypename: {0}'.format(geno_file)
90 print >> fh, 'snpname: {0}'.format(snp_file) 95 print >> fh, 'snpname: {0}'.format(snp_file)
173 print >> sys.stderr, stderrdata 178 print >> sys.stderr, stderrdata
174 sys.exit(1) 179 sys.exit(1)
175 180
176 shutil.copy2('fake', coords_file) 181 shutil.copy2('fake', coords_file)
177 182
183 ind_regex = re.compile('ind_([0-9]+)')
184
185 def fix_names(name_map, files):
186 for file in files:
187 tmp_filename = '%s.tmp' % file
188 with open(tmp_filename, 'w') as ofh:
189 with open(file) as fh:
190 for line in fh:
191 line = line.rstrip('\r\n')
192 match = ind_regex.search(line)
193 if match:
194 idx = int(match.group(1))
195 old = 'ind_%s' % idx
196 new = name_map[idx].replace(' ', '_')
197 line = line.replace(old, new)
198 print >> ofh, line
199
200 shutil.copy2(tmp_filename, file)
201 os.unlink(tmp_filename)
202
178 ################################################################################ 203 ################################################################################
179 204
180 if len(sys.argv) != 5: 205 if len(sys.argv) != 5:
181 print "usage" 206 print "usage"
182 sys.exit(1) 207 sys.exit(1)
192 map_file = os.path.join(input_files_path, 'admix.map') 217 map_file = os.path.join(input_files_path, 'admix.map')
193 snp_file = os.path.join(output_files_path, 'admix.snp') 218 snp_file = os.path.join(output_files_path, 'admix.snp')
194 do_map2snp(map_file, snp_file) 219 do_map2snp(map_file, snp_file)
195 220
196 ind_file = os.path.join(output_files_path, 'admix.ind') 221 ind_file = os.path.join(output_files_path, 'admix.ind')
197 population_names = make_ind_file(ind_file, input) 222 population_names, name_map = make_ind_file(ind_file, input)
198 223
199 par_file = os.path.join(output_files_path, 'par.admix') 224 par_file = os.path.join(output_files_path, 'par.admix')
200 evec_file = os.path.join(output_files_path, 'coordinates.txt') 225 evec_file = os.path.join(output_files_path, 'coordinates.txt')
201 eval_file = os.path.join(output_files_path, 'admix.eval') 226 eval_file = os.path.join(output_files_path, 'admix.eval')
202 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) 227 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file)
203 228
204 smartpca_stats = do_smartpca(par_file) 229 smartpca_stats = do_smartpca(par_file)
230 fix_names(name_map, [ind_file, evec_file])
205 231
206 do_ploteig(evec_file, population_names) 232 do_ploteig(evec_file, population_names)
207 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) 233 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))
208 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') 234 output_plot_file = os.path.join(output_files_path, 'PCA.pdf')
209 shutil.copy2(plot_file, output_plot_file) 235 shutil.copy2(plot_file, output_plot_file)