Mercurial > repos > miller-lab > genome_diversity
annotate prepare_population_structure.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | 8997f2ca8c7a |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
3 import gd_util |
0 | 4 import os |
5 import shutil | |
6 import sys | |
7 from Population import Population | |
8 import gd_composite | |
9 | |
10 ################################################################################ | |
11 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
24
diff
changeset
|
def do_import(filename, files_path, min_reads, min_qual, min_spacing, using_info, population_list):
    """Write the Galaxy composite-dataset info page to *filename*.

    The page lists the two output files (admix.ped and admix.map), the
    *using_info* text, the three filtering thresholds and the populations.

    filename        -- path of the page to write (opened with mode 'w')
    files_path      -- not used by this function; kept so existing callers
                       keep working
    min_reads       -- value shown for 'Minimum reads covering a SNP, per individual'
    min_qual        -- value shown for 'Minimum quality value, per individual'
    min_spacing     -- value shown for 'Minimum spacing between SNPs on the same scaffold'
    using_info      -- description text of the third output entry
    population_list -- value rendered through gd_composite.DisplayPopulationList
    """
    info_page = gd_composite.InfoPage()
    info_page.set_title('Prepare to look for population structure Galaxy Composite Dataset')

    display_file = gd_composite.DisplayFile()
    display_value = gd_composite.DisplayValue()

    # Outputs are registered in the same order as before: ped, map, summary.
    outputs = (
        gd_composite.Parameter(name='admix.ped', value='admix.ped', display_type=display_file),
        gd_composite.Parameter(name='admix.map', value='admix.map', display_type=display_file),
        gd_composite.Parameter(description=using_info, display_type=display_value),
    )
    for parameter in outputs:
        info_page.add_output_parameter(parameter)

    inputs = (
        gd_composite.Parameter(description='Minimum reads covering a SNP, per individual', value=min_reads, display_type=display_value),
        gd_composite.Parameter(description='Minimum quality value, per individual', value=min_qual, display_type=display_value),
        gd_composite.Parameter(description='Minimum spacing between SNPs on the same scaffold', value=min_spacing, display_type=display_value),
    )
    for parameter in inputs:
        info_page.add_input_parameter(parameter)

    misc_populations = gd_composite.Parameter(name='Populations', value=population_list, display_type=gd_composite.DisplayPopulationList())
    info_page.add_misc(misc_populations)

    with open(filename, 'w') as ofh:
        # A plain write() works on both Python 2 and Python 3; the old
        # "print >> ofh" statement raises TypeError under Python 3.
        # The trailing newline that print used to emit is preserved.
        ofh.write('{0}\n'.format(info_page.render()))
40 | |
41 ################################################################################ | |
42 | |
24
248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
Richard Burhans <burhans@bx.psu.edu>
parents:
0
diff
changeset
|
# Prefix that marks a "population:<file>:<name>" command-line argument.
POPULATION_PREFIX = 'population:'


def parse_selection_args(args):
    """Split the trailing command-line arguments into population selections.

    Recognised forms are the literal 'all_individuals' and
    'population:<file>:<name>' (the name may itself contain ':').
    Any other argument, including a bare 'population:', is ignored.

    Returns a tuple (population_files, all_individuals) where
    population_files is a list of (file, name) pairs in argument order and
    all_individuals is True when 'all_individuals' was given.
    """
    population_files = []
    all_individuals = False

    for arg in args:
        if arg == 'all_individuals':
            all_individuals = True
        elif len(arg) > len(POPULATION_PREFIX) and arg.startswith(POPULATION_PREFIX):
            path, separator, name = arg[len(POPULATION_PREFIX):].partition(':')
            if not separator:
                # Without the second ':' there is no population name; report
                # it instead of failing with an unpacking ValueError.
                # gd_util.die is assumed to terminate the process, as the
                # usage check in main() also relies on -- TODO confirm.
                gd_util.die('Malformed population argument: {0}'.format(arg))
            population_files.append((path, name))

    return population_files, all_individuals


def genotype_tag(tag):
    """Return '<column>:<name>' with the column number lowered by two.

    Applied to every tag when the input type is 'gd_genotype'.
    NOTE(review): the offset of 2 presumably reflects a different column
    layout for gd_genotype tables -- confirm against admix_prep.
    """
    column, name = tag.split(':', 1)
    return '{0}:{1}'.format(int(column) - 2, name)


def main(argv):
    """Run admix_prep on the selected individuals and publish its output.

    argv[1:9] must hold, in order: input SNP file, input type, minimum
    reads, minimum quality, minimum spacing, output file, output files
    directory and the wrapped individuals dictionary.  The remaining
    arguments select the populations (see parse_selection_args); at least
    one must be present or the usage error is raised.
    """
    if len(argv) < 10:
        gd_util.die('Usage')

    # parse command line
    (input_snp_filename, input_type, min_reads, min_qual, min_spacing,
     output_filename, output_files_path, ind_arg) = argv[1:9]
    population_files, all_individuals = parse_selection_args(argv[9:])

    p_total = Population()
    p_total.from_wrapped_dict(ind_arg)

    population_list = []

    if all_individuals:
        p1 = p_total
        p1.name = 'All Individuals'
        population_list.append(p1)
    else:
        p1 = Population()
        for path, name in population_files:
            this_pop = Population(name)
            this_pop.from_population_file(path)
            population_list.append(this_pop)

            # add individuals from this file to p1
            p1.from_population_file(path)

    if not p_total.is_superset(p1):
        gd_util.die('There is an individual in the population that is not in the SNP table')

    ############################################################################

    prog = 'admix_prep'

    prog_args = [prog, input_snp_filename, min_reads, min_qual, min_spacing]
    for tag in p1.tag_list():
        if input_type == 'gd_genotype':
            tag = genotype_tag(tag)
        prog_args.append(tag)

    # Only the program's standard output is used; it becomes the summary
    # text shown on the info page.
    stdoutdata, _ = gd_util.run_program(prog, prog_args)
    using_info = stdoutdata.rstrip('\r\n')

    ############################################################################

    gd_util.mkdir_p(output_files_path)

    # admix.ped / admix.map are read from the current working directory
    # (presumably written there by admix_prep) and copied next to the page.
    output_ped_filename = os.path.join(output_files_path, 'admix.ped')
    output_map_filename = os.path.join(output_files_path, 'admix.map')
    shutil.copy2('admix.ped', output_ped_filename)
    shutil.copy2('admix.map', output_map_filename)

    do_import(output_filename, output_files_path, min_reads, min_qual, min_spacing, using_info, population_list)
    sys.exit(0)


if __name__ == '__main__':
    main(sys.argv)
118 |