Mercurial > repos > miller-lab > genome_diversity
comparison pca.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | 2c498d40ecde |
children | 8997f2ca8c7a |
comparison
equal
deleted
inserted
replaced
23:66a183c44dd5 | 24:248b06e86022 |
---|---|
5 import shutil | 5 import shutil |
6 import subprocess | 6 import subprocess |
7 import sys | 7 import sys |
8 from BeautifulSoup import BeautifulSoup | 8 from BeautifulSoup import BeautifulSoup |
9 import gd_composite | 9 import gd_composite |
10 import re | |
10 | 11 |
11 ################################################################################ | 12 ################################################################################ |
12 | 13 |
13 def mkdir_p(path): | 14 def mkdir_p(path): |
14 try: | 15 try: |
60 elems = line.split() | 61 elems = line.split() |
61 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) | 62 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) |
62 | 63 |
63 def make_ind_file(ind_file, input): | 64 def make_ind_file(ind_file, input): |
64 pops = [] | 65 pops = [] |
66 name_map = [] | |
67 name_idx = 0 | |
65 | 68 |
66 ofh = open(ind_file, 'w') | 69 ofh = open(ind_file, 'w') |
67 | 70 |
68 with open(input) as fh: | 71 with open(input) as fh: |
69 soup = BeautifulSoup(fh) | 72 soup = BeautifulSoup(fh) |
76 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') | 79 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') |
77 pops.append(population_name) | 80 pops.append(population_name) |
78 individuals = entry.ol('li') | 81 individuals = entry.ol('li') |
79 for individual in individuals: | 82 for individual in individuals: |
80 individual_name = individual.string.encode('utf8').strip() | 83 individual_name = individual.string.encode('utf8').strip() |
81 print >> ofh, individual_name, 'M', population_name | 84 name_map.append(individual_name) |
85 print >> ofh, 'ind_%s' % name_idx, 'M', population_name | |
86 name_idx += 1 | |
82 i += 1 | 87 i += 1 |
83 | 88 |
84 ofh.close() | 89 ofh.close() |
85 return pops | 90 return pops, name_map |
86 | 91 |
87 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): | 92 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): |
88 with open(par_file, 'w') as fh: | 93 with open(par_file, 'w') as fh: |
89 print >> fh, 'genotypename: {0}'.format(geno_file) | 94 print >> fh, 'genotypename: {0}'.format(geno_file) |
90 print >> fh, 'snpname: {0}'.format(snp_file) | 95 print >> fh, 'snpname: {0}'.format(snp_file) |
173 print >> sys.stderr, stderrdata | 178 print >> sys.stderr, stderrdata |
174 sys.exit(1) | 179 sys.exit(1) |
175 | 180 |
176 shutil.copy2('fake', coords_file) | 181 shutil.copy2('fake', coords_file) |
177 | 182 |
183 ind_regex = re.compile('ind_([0-9]+)') | |
184 | |
185 def fix_names(name_map, files): | |
186 for file in files: | |
187 tmp_filename = '%s.tmp' % file | |
188 with open(tmp_filename, 'w') as ofh: | |
189 with open(file) as fh: | |
190 for line in fh: | |
191 line = line.rstrip('\r\n') | |
192 match = ind_regex.search(line) | |
193 if match: | |
194 idx = int(match.group(1)) | |
195 old = 'ind_%s' % idx | |
196 new = name_map[idx].replace(' ', '_') | |
197 line = line.replace(old, new) | |
198 print >> ofh, line | |
199 | |
200 shutil.copy2(tmp_filename, file) | |
201 os.unlink(tmp_filename) | |
202 | |
178 ################################################################################ | 203 ################################################################################ |
179 | 204 |
180 if len(sys.argv) != 5: | 205 if len(sys.argv) != 5: |
181 print "usage" | 206 print "usage" |
182 sys.exit(1) | 207 sys.exit(1) |
192 map_file = os.path.join(input_files_path, 'admix.map') | 217 map_file = os.path.join(input_files_path, 'admix.map') |
193 snp_file = os.path.join(output_files_path, 'admix.snp') | 218 snp_file = os.path.join(output_files_path, 'admix.snp') |
194 do_map2snp(map_file, snp_file) | 219 do_map2snp(map_file, snp_file) |
195 | 220 |
196 ind_file = os.path.join(output_files_path, 'admix.ind') | 221 ind_file = os.path.join(output_files_path, 'admix.ind') |
197 population_names = make_ind_file(ind_file, input) | 222 population_names, name_map = make_ind_file(ind_file, input) |
198 | 223 |
199 par_file = os.path.join(output_files_path, 'par.admix') | 224 par_file = os.path.join(output_files_path, 'par.admix') |
200 evec_file = os.path.join(output_files_path, 'coordinates.txt') | 225 evec_file = os.path.join(output_files_path, 'coordinates.txt') |
201 eval_file = os.path.join(output_files_path, 'admix.eval') | 226 eval_file = os.path.join(output_files_path, 'admix.eval') |
202 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) | 227 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) |
203 | 228 |
204 smartpca_stats = do_smartpca(par_file) | 229 smartpca_stats = do_smartpca(par_file) |
230 fix_names(name_map, [ind_file, evec_file]) | |
205 | 231 |
206 do_ploteig(evec_file, population_names) | 232 do_ploteig(evec_file, population_names) |
207 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) | 233 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) |
208 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') | 234 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') |
209 shutil.copy2(plot_file, output_plot_file) | 235 shutil.copy2(plot_file, output_plot_file) |