Mercurial > repos > miller-lab > genome_diversity
diff specify.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | |
children | 8997f2ca8c7a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/specify.py Tue May 28 16:24:19 2013 -0400 @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +import sys +import base64 + +def parse_args(args): + if len(args) < 3: + usage() + + input_file, output_file = args[1:3] + + individuals = [] + checkboxes = [] + strings = [] + + for arg in args[3:]: + if ':' in arg: + arg_type, arg = arg.split(':', 1) + else: + print >> sys.stderr, "unknown argument:", arg + usage() + + if arg_type == 'individual': + individuals.append(arg) + elif arg_type == 'checkbox': + checkboxes.append(arg) + elif arg_type == 'string': + strings.append(arg) + else: + print >> sys.stderr, "unknown argument:", arg + usage() + + return input_file, output_file, individuals, checkboxes, strings + +def usage(): + print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0]) + sys.exit(1) + +def parse_individuals(individuals): + ind_col2name = {} + ind_name2col = {} + + for individual in individuals: + if ':' in individual: + column, name = individual.split(':', 1) + else: + print >> sys.stderr, "invalid individual specification:", individual + usage() + + try: + column = int(column) + except: + print "individual column is not an integer:", individual + usage() + + if column not in ind_col2name: + ind_col2name[column] = name + else: + if ind_col2name[column] != name: + print "duplicate individual column:", name, column, ind_col2name[column] + usage() + + if name not in ind_name2col: + ind_name2col[name] = [column] + elif column not in ind_name2col[name]: + ind_name2col[name].append(column) + + return ind_col2name, ind_name2col + +def parse_checkboxes(checkboxes, ind_col2name): + columns = [] + + for checkbox in checkboxes: + if ':' in checkbox: + column, name = checkbox.split(':', 1) + else: + print >> sys.stderr, "invalid checkbox specification:", checkbox + usage() + + try: + column = int(column) + except: + print "checkbox column is not an integer:", checkbox + usage() + + if column not in ind_col2name: + print "individual not in SNP table:", name + usage() + + if column not in columns: + columns.append(column) + + return columns + +def parse_strings(strings, ind_col2name, ind_name2col): + columns = [] + + for string in strings: + try: + decoded = base64.b64decode(string) + except: + print >> sys.stderr, "invalid base64 string:", string + usage() + + names = find_names(decoded, ind_name2col.keys()) + for name in names: + cols = ind_name2col[name] + if len(cols) == 1: + col = cols[0] + if col not in columns: + columns.append(col) + else: + print >> sys.stderr, "name with multiple columns:", name + usage() + + return columns + +def find_names(string, names): + rv = [] + for name in names: + if name in string: + if name not in rv: + rv.append(name) + return rv + + + + +input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv) +ind_col2name, ind_name2col = parse_individuals(individuals) +cb_cols = parse_checkboxes(checkboxes, ind_col2name) +str_cols = parse_strings(strings, ind_col2name, ind_name2col) + +out_cols = cb_cols +for col in str_cols: + if col not in out_cols: + out_cols.append(col) + +with open(output_file, 'w') as fh: + for col in sorted(out_cols): + print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']]) + +sys.exit(0) + + + +