Mercurial > repos > miller-lab > genome_diversity
diff specify.py @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
| author | Richard Burhans <burhans@bx.psu.edu> |
|---|---|
| date | Mon, 15 Jul 2013 10:47:35 -0400 |
| parents | 248b06e86022 |
| children | |
line wrap: on
line diff
```diff
--- a/specify.py Mon Jun 03 12:29:29 2013 -0400
+++ b/specify.py Mon Jul 15 10:47:35 2013 -0400
@@ -1,147 +1,69 @@
 #!/usr/bin/env python
 
+import gd_util
 import sys
-import base64
-
-def parse_args(args):
-    if len(args) < 3:
-        usage()
-
-    input_file, output_file = args[1:3]
-
-    individuals = []
-    checkboxes = []
-    strings = []
-
-    for arg in args[3:]:
-        if ':' in arg:
-            arg_type, arg = arg.split(':', 1)
-        else:
-            print >> sys.stderr, "unknown argument:", arg
-            usage()
-
-        if arg_type == 'individual':
-            individuals.append(arg)
-        elif arg_type == 'checkbox':
-            checkboxes.append(arg)
-        elif arg_type == 'string':
-            strings.append(arg)
-        else:
-            print >> sys.stderr, "unknown argument:", arg
-            usage()
-
-    return input_file, output_file, individuals, checkboxes, strings
+from Population import Population
 
-def usage():
-    print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0])
-    sys.exit(1)
-
-def parse_individuals(individuals):
-    ind_col2name = {}
-    ind_name2col = {}
-
-    for individual in individuals:
-        if ':' in individual:
-            column, name = individual.split(':', 1)
-        else:
-            print >> sys.stderr, "invalid individual specification:", individual
-            usage()
+################################################################################
 
-        try:
-            column = int(column)
-        except:
-            print "individual column is not an integer:", individual
-            usage()
-
-        if column not in ind_col2name:
-            ind_col2name[column] = name
-        else:
-            if ind_col2name[column] != name:
-                print "duplicate individual column:", name, column, ind_col2name[column]
-                usage()
-
-        if name not in ind_name2col:
-            ind_name2col[name] = [column]
-        elif column not in ind_name2col[name]:
-            ind_name2col[name].append(column)
-
-    return ind_col2name, ind_name2col
-
-def parse_checkboxes(checkboxes, ind_col2name):
+def parse_string(str_arg, ind_token2col):
     columns = []
 
-    for checkbox in checkboxes:
-        if ':' in checkbox:
-            column, name = checkbox.split(':', 1)
-        else:
-            print >> sys.stderr, "invalid checkbox specification:", checkbox
-            usage()
+    string = gd_util.unwrap_string(str_arg)
+    tokens = find_tokens(string, ind_token2col)
 
-        try:
-            column = int(column)
-        except:
-            print "checkbox column is not an integer:", checkbox
-            usage()
-
-        if column not in ind_col2name:
-            print "individual not in SNP table:", name
-            usage()
-
-        if column not in columns:
-            columns.append(column)
+    for token in tokens:
+        col = ind_token2col[token]
+        if col not in columns:
+            columns.append(col)
 
     return columns
 
-def parse_strings(strings, ind_col2name, ind_name2col):
-    columns = []
-
-    for string in strings:
-        try:
-            decoded = base64.b64decode(string)
-        except:
-            print >> sys.stderr, "invalid base64 string:", string
-            usage()
-
-        names = find_names(decoded, ind_name2col.keys())
-        for name in names:
-            cols = ind_name2col[name]
-            if len(cols) == 1:
-                col = cols[0]
-                if col not in columns:
-                    columns.append(col)
-            else:
-                print >> sys.stderr, "name with multiple columns:", name
-                usage()
-
-    return columns
-
-def find_names(string, names):
+def find_tokens(string, tokens):
     rv = []
 
-    for name in names:
-        if name in string:
-            if name not in rv:
-                rv.append(name)
+    for token in tokens:
+        if token in string:
+            if token not in rv:
+                rv.append(token)
 
     return rv
 
+################################################################################
 
+if len(sys.argv) != 6:
+    gd_util.die('Usage')
+input, output, ind_arg, cb_arg, str_arg = sys.argv[1:]
 
-input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv)
-ind_col2name, ind_name2col = parse_individuals(individuals)
-cb_cols = parse_checkboxes(checkboxes, ind_col2name)
-str_cols = parse_strings(strings, ind_col2name, ind_name2col)
+p_total = Population()
+p_total.from_wrapped_dict(ind_arg)
+
+p_cb = Population()
+p_cb.from_wrapped_dict(cb_arg)
+
+if not p_total.is_superset(p_cb):
+    gd_util.die('There is a checked individual that does not appear in the SNP table')
+
+################################################################################
 
-out_cols = cb_cols
-for col in str_cols:
-    if col not in out_cols:
-        out_cols.append(col)
+ind_col2name = {}
+ind_token2col = {}
+for col in p_total.column_list():
+    individual = p_total.individual_with_column(col)
+    name = individual.name
+    ind_col2name[col] = name
+    first_token = name.split()[0]
+    if first_token not in ind_token2col:
+        ind_token2col[first_token] = col
+    else:
+        gd_util.die('duplicate first token: {0}'.format(first_token))
 
-with open(output_file, 'w') as fh:
-    for col in sorted(out_cols):
-        print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']])
+out_cols = p_cb.column_list()
+str_cols = parse_string(str_arg, ind_token2col)
+
+with open(output, 'w') as fh:
+    for col in sorted(ind_col2name.keys()):
+        if col in out_cols or col in str_cols:
+            print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']])
 
 sys.exit(0)
-
-
-
```