Mercurial > repos > miller-lab > genome_diversity
diff pca.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | 2c498d40ecde |
children | 8997f2ca8c7a |
line wrap: on
line diff
--- a/pca.py Wed May 22 15:58:18 2013 -0400 +++ b/pca.py Tue May 28 16:24:19 2013 -0400 @@ -7,6 +7,7 @@ import sys from BeautifulSoup import BeautifulSoup import gd_composite +import re ################################################################################ @@ -62,6 +63,8 @@ def make_ind_file(ind_file, input): pops = [] + name_map = [] + name_idx = 0 ofh = open(ind_file, 'w') @@ -78,11 +81,13 @@ individuals = entry.ol('li') for individual in individuals: individual_name = individual.string.encode('utf8').strip() - print >> ofh, individual_name, 'M', population_name + name_map.append(individual_name) + print >> ofh, 'ind_%s' % name_idx, 'M', population_name + name_idx += 1 i += 1 ofh.close() - return pops + return pops, name_map def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): with open(par_file, 'w') as fh: @@ -175,6 +180,26 @@ shutil.copy2('fake', coords_file) +ind_regex = re.compile('ind_([0-9]+)') + +def fix_names(name_map, files): + for file in files: + tmp_filename = '%s.tmp' % file + with open(tmp_filename, 'w') as ofh: + with open(file) as fh: + for line in fh: + line = line.rstrip('\r\n') + match = ind_regex.search(line) + if match: + idx = int(match.group(1)) + old = 'ind_%s' % idx + new = name_map[idx].replace(' ', '_') + line = line.replace(old, new) + print >> ofh, line + + shutil.copy2(tmp_filename, file) + os.unlink(tmp_filename) + ################################################################################ if len(sys.argv) != 5: @@ -194,7 +219,7 @@ do_map2snp(map_file, snp_file) ind_file = os.path.join(output_files_path, 'admix.ind') -population_names = make_ind_file(ind_file, input) +population_names, name_map = make_ind_file(ind_file, input) par_file = os.path.join(output_files_path, 'par.admix') evec_file = os.path.join(output_files_path, 'coordinates.txt') @@ -202,6 +227,7 @@ make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) smartpca_stats = do_smartpca(par_file) +fix_names(name_map, [ind_file, evec_file]) do_ploteig(evec_file, population_names) plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))