Mercurial > repos > miller-lab > genome_diversity
view make_gd_file.py @ 32:03c22b722882
remove BeautifulSoup dependency
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:54:23 -0400 |
parents | 8997f2ca8c7a |
children |
line wrap: on
line source
#!/usr/bin/env python import base64 import json import math import re import sys identifier_regex = re.compile('[0-9A-Z_a-z]+$') def unwrap_column_names(string): column_names = [] string = unwrap_string(string) for line in string.split('\n'): line = line.strip() if is_identifier(line): column_names.append(line) else: die('invalid column name format: {}'.format(line)) return column_names def unwrap_string(string): try: decoded = base64.b64decode(string) except: die('invalid base64 string: {}'.format(string)) return decoded def is_identifier(string): match = identifier_regex.match(string) if match: return True else: return False def read_individual_names(filename): tokens = [] names = [] with open(filename) as fh: for line in fh: line = line.rstrip('\r\n') elems = line.split() columns = len(elems) if columns == 0: continue first_token = elems[0] if columns == 1: name = first_token else: keywords = ' '.join(elems[1:]) name = ' '.join([first_token, keywords]) if first_token not in tokens: tokens.append(first_token) names.append(name) else: die('duplicate first column entry in Names dataset: {}'.format(first_token)) return names def fold_line(line, maxlen, prefix): prefix_len = len(prefix) lines = [] while len(line) > maxlen: split_points = [] state = 0 for i in range(maxlen - prefix_len): c = line[i] if state == 0: if c == '"': state = 1 elif c in [ '{', ':', ',', '}', '[', ']' ]: split_points.append(i) elif state == 1: if c == '"': state = 0 elif c == '\\': state = 2 elif state == 2: state = 1 idx = split_points[-1] lines.append('{0}{1}'.format(prefix, line[:idx+1])) line = line[idx+1:] lines.append('{0}{1}'.format(prefix, line)) return lines def die(message): print >> sys.stderr, message sys.exit(1) ################################################################################ type_to_columns = { 'gd_snp':4, 'gd_genotype':1 } if len(sys.argv) != 12: print >> sys.stderr, 'Usage' sys.exit(1) input, scaffold_col, pos_col, ref_col, rPos_col, preamble_arg, names, species_arg, dbkey, output_type, output = sys.argv[1:12] preamble_column_names = unwrap_column_names(preamble_arg) first_individual_column = len(preamble_column_names) + 1 individual_names = read_individual_names(names) species = unwrap_string(species_arg) if not is_identifier(species): die('invalid species format: {}'.format(species)) if not output_type in type_to_columns: die('unknown output type: {}'.format(output_type)) columns_per_individual = type_to_columns[output_type] jdict = {} column_names = preamble_column_names[:] for i in range(1, len(individual_names) + 1): if output_type == 'gd_snp': column_names.append('{}A'.format(i)) column_names.append('{}B'.format(i)) column_names.append('{}G'.format(i)) column_names.append('{}Q'.format(i)) elif output_type == 'gd_genotype': column_names.append('{}G'.format(i)) else: die('unknown output type: {}'.format(output_type)) jdict['column_names'] = column_names individuals = [] for pos, individual in enumerate(individual_names): col = first_individual_column + pos * columns_per_individual individuals.append([individual, col]) jdict['individuals'] = individuals jdict['scaffold'] = int(scaffold_col) jdict['pos'] = int(pos_col) jdict['ref'] = int(ref_col) jdict['rPos'] = int(rPos_col) jdict['species'] = species jdict['dbkey'] = dbkey json_string = json.dumps(jdict, separators=(',',':'), sort_keys=True) min_cols = len(column_names) pos_col = int(pos_col) - 1 rPos_col = int(rPos_col) - 1 def is_int(string): try: int(string) return True except ValueError: return False with open(output, 'w') as ofh: lines = fold_line(json_string, 200, '#') for line in lines: print >> ofh, line with open(input) as fh: line_number = 0 for line in fh: line_number += 1 if line[0] == '#': continue line = line.rstrip('\r\n') elems = line.split('\t') if len(elems) < min_cols: die('Too few columns on line {0} of input file. Expecting {1}, saw {2}.'.format(line_number, min_cols, len(elems))) if not is_int(elems[pos_col]): die('bad pos on line {0} column {1} of input file: {2}'.format(line_number, pos_col+1, elems[pos_col])) if not is_int(elems[rPos_col]): die('bad rPos on line {0} column {1} of input file: {2}'.format(line_number, rPos_col+1, elems[rPos_col])) print >> ofh, line