genome_diversity: pca.py comparison

comparison pca.py @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.

author	Richard Burhans <burhans@bx.psu.edu>
date	Tue, 28 May 2013 16:24:19 -0400
parents	2c498d40ecde
children	8997f2ca8c7a

comparison

equal deleted inserted replaced

-:66a183c44dd5
+:248b06e86022
 import shutil
 import subprocess
 import sys
 from BeautifulSoup import BeautifulSoup
 import gd_composite
+import re
 ################################################################################
 def mkdir_p(path):
 try:
 elems = line.split()
 print >> ofh, '  {0} 11 0.002 2000 A T'.format(elems[1])
 def make_ind_file(ind_file, input):
 pops = []
+name_map = []
+name_idx = 0
 ofh = open(ind_file, 'w')
 with open(input) as fh:
 soup = BeautifulSoup(fh)
 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_')
 pops.append(population_name)
 individuals = entry.ol('li')
 for individual in individuals:
 individual_name = individual.string.encode('utf8').strip()
-print >> ofh, individual_name, 'M', population_name
+name_map.append(individual_name)
+print >> ofh, 'ind_%s' % name_idx, 'M', population_name
+name_idx += 1
 i += 1
 ofh.close()
-return pops
+return pops, name_map
 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file):
 with open(par_file, 'w') as fh:
 print >> fh, 'genotypename: {0}'.format(geno_file)
 print >> fh, 'snpname: {0}'.format(snp_file)
 print >> sys.stderr, stderrdata
 sys.exit(1)
 shutil.copy2('fake', coords_file)
# Matches the placeholder labels ("ind_<index>") that make_ind_file writes to
# the .ind file in place of the real individual names.
ind_regex = re.compile('ind_([0-9]+)')


def fix_names(name_map, files):
    """Rewrite each file in place, turning ind_<N> placeholders into real names.

    name_map -- list of individual names; position N holds the name that the
                placeholder 'ind_N' stands for.
    files    -- iterable of paths to text files to rewrite.

    Spaces inside a restored name are replaced with underscores.  Every
    placeholder on a line is resolved by its own index, so a short id such as
    'ind_1' can never clobber the prefix of a longer one such as 'ind_12'.
    A placeholder whose index is not present in name_map is left untouched
    instead of aborting the rewrite half way through.
    """
    def restore(match):
        # Map one matched placeholder back to its (space-free) real name.
        idx = int(match.group(1))
        if idx >= len(name_map):
            return match.group(0)
        return name_map[idx].replace(' ', '_')

    for path in files:
        tmp_filename = '%s.tmp' % path
        with open(tmp_filename, 'w') as ofh:
            with open(path) as fh:
                for line in fh:
                    # Normalise the line ending: strip CR/LF, emit a single LF.
                    line = line.rstrip('\r\n')
                    ofh.write(ind_regex.sub(restore, line) + '\n')
        # Copy over the original (rather than rename) so the destination path
        # itself is preserved, then drop the scratch file.
        shutil.copy2(tmp_filename, path)
        os.unlink(tmp_filename)
 ################################################################################
 if len(sys.argv) != 5:
 print "usage"
 sys.exit(1)
 map_file = os.path.join(input_files_path, 'admix.map')
 snp_file = os.path.join(output_files_path, 'admix.snp')
 do_map2snp(map_file, snp_file)
 ind_file = os.path.join(output_files_path, 'admix.ind')
-population_names = make_ind_file(ind_file, input)
+population_names, name_map = make_ind_file(ind_file, input)
 par_file = os.path.join(output_files_path, 'par.admix')
 evec_file = os.path.join(output_files_path, 'coordinates.txt')
 eval_file = os.path.join(output_files_path, 'admix.eval')
 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file)
 smartpca_stats = do_smartpca(par_file)
+fix_names(name_map, [ind_file, evec_file])
 do_ploteig(evec_file, population_names)
 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))
 output_plot_file = os.path.join(output_files_path, 'PCA.pdf')
 shutil.copy2(plot_file, output_plot_file)

Mercurial > repos > miller-lab > genome_diversity

comparison pca.py @ 24:248b06e86022