Mercurial > repos > miller-lab > genome_diversity
comparison specify.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | |
children | 8997f2ca8c7a |
comparison
equal
deleted
inserted
replaced
23:66a183c44dd5 | 24:248b06e86022 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import base64 | |
5 | |
def parse_args(args):
    """Split the command line into file names and typed selector lists.

    ``args`` is the full argument vector, program name first.  Elements 1
    and 2 are the input and output file names.  Every later element must
    look like ``<type>:<value>`` where ``<type>`` is ``individual``,
    ``checkbox`` or ``string``; only the first colon separates the type
    from the value, so the value may itself contain colons.

    Returns the tuple ``(input_file, output_file, individuals, checkboxes,
    strings)``; each list holds the ``<value>`` parts in command-line order.

    Calls ``usage()`` (which exits with status 1) when fewer than three
    elements are supplied, or after reporting an argument whose type
    prefix is missing or unrecognised.
    """
    if len(args) < 3:
        usage()

    input_file, output_file = args[1:3]

    # One bucket per recognised type prefix.
    buckets = {'individual': [], 'checkbox': [], 'string': []}

    for raw_arg in args[3:]:
        arg_type, colon, value = raw_arg.partition(':')
        if not colon or arg_type not in buckets:
            # Report the argument exactly as it was typed.  The value alone
            # (without its type prefix) would not let the user recognise
            # which argument was rejected.
            sys.stderr.write("unknown argument: %s\n" % (raw_arg,))
            usage()
        buckets[arg_type].append(value)

    return (input_file, output_file,
            buckets['individual'], buckets['checkbox'], buckets['string'])
34 | |
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    synopsis = "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0])
    sys.stderr.write(synopsis + "\n")
    sys.exit(1)
38 | |
def parse_individuals(individuals):
    """Build the column<->name lookup tables from ``col:name`` specs.

    ``individuals`` is a list of strings of the form ``<column>:<name>``;
    only the first colon is a separator and ``<column>`` must parse as an
    integer.

    Returns ``(ind_col2name, ind_name2col)``:

    * ``ind_col2name`` maps each integer column to its name;
    * ``ind_name2col`` maps each name to the list of columns that carry
      it, in first-seen order and without repeats.

    A diagnostic is written to stderr and ``usage()`` is called (which
    exits with status 1) when a spec has no colon, when its column is not
    an integer, or when one column is given two different names.
    Repeating an identical spec is accepted.
    """
    ind_col2name = {}
    ind_name2col = {}

    for individual in individuals:
        column_text, colon, name = individual.partition(':')
        if not colon:
            sys.stderr.write(
                "invalid individual specification: %s\n" % (individual,))
            usage()

        try:
            column = int(column_text)
        except ValueError:
            # Diagnostics go to stderr like every other error in this
            # script, so they cannot be mistaken for regular output.
            sys.stderr.write(
                "individual column is not an integer: %s\n" % (individual,))
            usage()

        # setdefault returns the name already registered for this column,
        # or registers and returns the new one.
        known_name = ind_col2name.setdefault(column, name)
        if known_name != name:
            sys.stderr.write(
                "duplicate individual column: %s %s %s\n"
                % (name, column, known_name))
            usage()

        name_columns = ind_name2col.setdefault(name, [])
        if column not in name_columns:
            name_columns.append(column)

    return ind_col2name, ind_name2col
69 | |
def parse_checkboxes(checkboxes, ind_col2name):
    """Return the columns selected through ``col:name`` checkbox specs.

    ``checkboxes`` is a list of strings of the form ``<column>:<name>``
    (split on the first colon); ``ind_col2name`` is the column-to-name
    mapping against which each column is validated.

    Returns the list of distinct integer columns in first-seen order.

    A diagnostic is written to stderr and ``usage()`` is called (which
    exits with status 1) when a spec has no colon, when its column is not
    an integer, or when the column is not a key of ``ind_col2name``.
    """
    columns = []

    for checkbox in checkboxes:
        column_text, colon, name = checkbox.partition(':')
        if not colon:
            sys.stderr.write(
                "invalid checkbox specification: %s\n" % (checkbox,))
            usage()

        try:
            column = int(column_text)
        except ValueError:
            # Diagnostics go to stderr like every other error in this
            # script, so they cannot be mistaken for regular output.
            sys.stderr.write(
                "checkbox column is not an integer: %s\n" % (checkbox,))
            usage()

        if column not in ind_col2name:
            sys.stderr.write("individual not in SNP table: %s\n" % (name,))
            usage()

        if column not in columns:
            columns.append(column)

    return columns
94 | |
def parse_strings(strings, ind_col2name, ind_name2col):
    """Return the columns of individuals named inside base64 strings.

    Each element of ``strings`` is base64-decoded and handed to
    ``find_names()`` together with the keys of ``ind_name2col``.  Every
    name reported back must map to exactly one column in ``ind_name2col``;
    that column is added to the result.

    ``ind_col2name`` is accepted for interface compatibility and is not
    consulted here.

    Returns the list of distinct columns in first-seen order.

    A diagnostic is written to stderr and ``usage()`` is called (which
    exits with status 1) when a string cannot be base64-decoded or when a
    matched name maps to more than one column.
    """
    columns = []

    for string in strings:
        try:
            decoded = base64.b64decode(string)
        except (TypeError, ValueError):
            # Python 2 reports bad base64 as TypeError; Python 3 raises
            # binascii.Error, a ValueError subclass.  Catching only these
            # lets KeyboardInterrupt and SystemExit propagate.
            sys.stderr.write("invalid base64 string: %s\n" % (string,))
            usage()

        for name in find_names(decoded, ind_name2col.keys()):
            cols = ind_name2col[name]
            if len(cols) != 1:
                # The name alone cannot identify a single column.
                sys.stderr.write(
                    "name with multiple columns: %s\n" % (name,))
                usage()

            col = cols[0]
            if col not in columns:
                columns.append(col)

    return columns
117 | |
def find_names(string, names):
    """Return the entries of ``names`` that occur as substrings of ``string``.

    The result keeps the iteration order of ``names`` and lists each
    matching entry once, even if ``names`` repeats it.
    """
    matches = []
    for candidate in names:
        # Skip anything absent from the text or already recorded.
        if candidate not in string or candidate in matches:
            continue
        matches.append(candidate)
    return matches
125 | |
126 | |
127 | |
128 | |
# Script body: parse the command line, resolve the selected columns and
# write one tab-separated row per column to the output file.
# input_file is parsed but never read in this part of the script.
input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv)
ind_col2name, ind_name2col = parse_individuals(individuals)
cb_cols = parse_checkboxes(checkboxes, ind_col2name)
str_cols = parse_strings(strings, ind_col2name, ind_name2col)

# Checkbox selections come first; string-derived columns are appended only
# when not already selected.  out_cols is the same list object as cb_cols.
out_cols = cb_cols
out_cols += [col for col in str_cols if col not in out_cols]

with open(output_file, 'w') as fh:
    for col in sorted(out_cols):
        # Row layout: column number, individual name, empty third field.
        fields = [str(col), str(ind_col2name[col]), str('')]
        fh.write('\t'.join(fields) + '\n')

sys.exit(0)
144 | |
145 | |
146 | |
147 |