Mercurial > repos > miller-lab > genome_diversity
view specify.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | |
children | 8997f2ca8c7a |
line wrap: on
line source
#!/usr/bin/env python
"""Resolve a selection of individuals to columns and write them out.

Command line::

    specify.py <input> <output> [individual:<col>:<name> ...]
               [checkbox:<col>:<name> ...] [string:<base64> ...]

``individual`` arguments declare which column belongs to which name.
``checkbox`` arguments select a column directly.  ``string`` arguments
carry base64-encoded text; every declared name that occurs in the decoded
text selects that name's column.

The output file receives one line per selected column, sorted by column
number, formatted as ``<col>\\t<name>\\t``.  The input file argument is
accepted but is not read by this script.

Any malformed argument prints a diagnostic and the usage line on standard
error and exits with status 1.
"""

import sys
import base64


def _error(*parts):
    """Write *parts*, separated by single spaces, as one line on stderr."""
    sys.stderr.write(' '.join(str(part) for part in parts) + '\n')


def _split_column_spec(spec, kind):
    """Split a ``<col>:<name>`` specification into ``(int(col), name)``.

    *kind* ("individual" or "checkbox") is only used to word the error
    message.  A missing colon or a non-integer column reports the problem
    and exits through usage().
    """
    column, sep, name = spec.partition(':')
    if not sep:
        _error("invalid %s specification:" % kind, spec)
        usage()
    try:
        column = int(column)
    except ValueError:
        _error("%s column is not an integer:" % kind, spec)
        usage()
    return column, name


def parse_args(args):
    """Split the raw argument vector into its typed groups.

    *args* is the full argv (program name first).  Returns the tuple
    ``(input_file, output_file, individuals, checkboxes, strings)`` where
    the three lists hold the text following the ``individual:``,
    ``checkbox:`` and ``string:`` prefixes, in command-line order.

    Fewer than two positional arguments, an argument without a colon, or an
    unrecognised prefix exits through usage().
    """
    if len(args) < 3:
        usage()

    input_file, output_file = args[1:3]
    groups = {'individual': [], 'checkbox': [], 'string': []}

    for raw in args[3:]:
        arg_type, sep, value = raw.partition(':')
        if not sep or arg_type not in groups:
            # Report the argument exactly as given, prefix included.
            _error("unknown argument:", raw)
            usage()
        groups[arg_type].append(value)

    return (input_file, output_file,
            groups['individual'], groups['checkbox'], groups['string'])


def usage():
    """Print the usage line on stderr and exit with status 1."""
    _error("Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0]))
    sys.exit(1)


def parse_individuals(individuals):
    """Build the column<->name lookup tables from ``<col>:<name>`` entries.

    Returns ``(ind_col2name, ind_name2col)``: the first maps an integer
    column to its name, the second maps a name to the list of columns
    declared for it (in first-seen order, without repeats).

    Declaring the same column with two different names is an error and
    exits through usage(); repeating an identical declaration is allowed.
    """
    ind_col2name = {}
    ind_name2col = {}

    for individual in individuals:
        column, name = _split_column_spec(individual, "individual")

        known_name = ind_col2name.setdefault(column, name)
        if known_name != name:
            _error("duplicate individual column:", name, column, known_name)
            usage()

        columns = ind_name2col.setdefault(name, [])
        if column not in columns:
            columns.append(column)

    return ind_col2name, ind_name2col


def parse_checkboxes(checkboxes, ind_col2name):
    """Return the distinct columns selected by ``<col>:<name>`` checkboxes.

    Each column must already be a key of *ind_col2name*; otherwise the
    script exits through usage().  Order of first appearance is kept.
    """
    columns = []

    for checkbox in checkboxes:
        column, name = _split_column_spec(checkbox, "checkbox")
        if column not in ind_col2name:
            _error("individual not in SNP table:", name)
            usage()
        if column not in columns:
            columns.append(column)

    return columns


def parse_strings(strings, ind_col2name, ind_name2col):
    """Return the distinct columns named inside base64-encoded strings.

    Every entry of *strings* is base64-decoded and searched for each key of
    *ind_name2col* (see find_names).  A matching name contributes its
    column; a name that maps to more than one column is ambiguous and exits
    through usage(), as does a string that cannot be decoded.

    *ind_col2name* is unused and kept only for call compatibility.
    """
    columns = []
    known_names = list(ind_name2col)

    for string in strings:
        try:
            decoded = base64.b64decode(string)
        except (TypeError, ValueError):
            # Python 2 raises TypeError, Python 3 binascii.Error (a
            # ValueError subclass) for malformed input.
            _error("invalid base64 string:", string)
            usage()

        if not isinstance(decoded, str):
            # Python 3 yields bytes; names are text, so compare as text.
            # Undecodable bytes are replaced rather than treated as fatal.
            decoded = decoded.decode('utf-8', 'replace')

        for name in find_names(decoded, known_names):
            cols = ind_name2col[name]
            if len(cols) != 1:
                _error("name with multiple columns:", name)
                usage()
            if cols[0] not in columns:
                columns.append(cols[0])

    return columns


def find_names(string, names):
    """Return the entries of *names* that occur as substrings of *string*.

    The result keeps the order of *names* and lists each name once.
    """
    rv = []
    for name in names:
        if name in string and name not in rv:
            rv.append(name)
    return rv


def main(argv=None):
    """Run the script on *argv* (defaults to sys.argv); return exit status."""
    if argv is None:
        argv = sys.argv

    input_file, output_file, individuals, checkboxes, strings = parse_args(argv)
    ind_col2name, ind_name2col = parse_individuals(individuals)
    cb_cols = parse_checkboxes(checkboxes, ind_col2name)
    str_cols = parse_strings(strings, ind_col2name, ind_name2col)

    # Union of both selections; a set avoids mutating cb_cols in place.
    out_cols = set(cb_cols)
    out_cols.update(str_cols)

    with open(output_file, 'w') as fh:
        for col in sorted(out_cols):
            # Trailing empty field is part of the output format.
            fh.write('\t'.join([str(col), str(ind_col2name[col]), '']) + '\n')

    return 0


if __name__ == '__main__':
    sys.exit(main())