#!/usr/bin/env python
"""Replace 'NA' with a number in the numerical columns of a tabular file.

Reads tab-separated text from stdin and writes it to stdout, replacing every
TARGET field ('NA') with REPLACEMENT (-1) in columns that hold numerical
values.  A column is numerical when its header is listed in NUMERICAL_COLUMNS
or when its string values do not outnumber its numeric values (ties count as
numerical; the header row takes part in the tally, TARGET fields do not).

Limitations: (a) can only take input from stdin and (b) cannot specify the
target or the replacement.
"""
# Provenance: replace_NA.py as added by HG changeset 5cb2020a097a
# (parent 9c75a9b5ecd2, user "devteam", commit message "Uploaded").

import os
import sys
import tempfile

# Constants.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1
# List of known numerical columns.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Default tally table for process_line_fields: one [string count, number
# count] pair per column.  Only used when no explicit table is passed.
col_type_counts = []


def split_fields(line):
    """Split one input line into its fields.

    Only the line terminator is removed.  Leading/trailing empty fields and
    surrounding spaces are kept so that field indices always line up with the
    header columns.
    """
    return line.rstrip('\r\n').split(SEPARATOR)


def is_number(value):
    """Return True if *value* is a string that float() can parse."""
    try:
        float(value)
    except ValueError:
        return False
    return True


def process_line_fields(fields, counts=None):
    """Tally, per column, how many fields of a line are strings vs. numbers.

    fields -- list of field strings for one line.
    counts -- list of [string count, number count] pairs, updated in place;
              defaults to the module-level ``col_type_counts``.  The list is
              extended when the line has more fields than columns seen so far.

    TARGET fields are ignored so that they do not bias the column type.
    """
    if counts is None:
        counts = col_type_counts
    # Rows may be wider than the header; make room instead of failing.
    while len(counts) < len(fields):
        counts.append([0, 0])
    for index, value in enumerate(fields):
        if value == TARGET:
            continue
        counts[index][1 if is_number(value) else 0] += 1


def classify_columns(counts, numerical_cols):
    """Return 'string' or 'number' for each column.

    counts         -- list of [string count, number count] pairs.
    numerical_cols -- indices of columns known to be numerical by header label.

    A column is 'string' only when it is not known to be numerical and its
    string values strictly outnumber its numeric values.
    """
    col_types = []
    for index, (n_strings, n_numbers) in enumerate(counts):
        if index in numerical_cols or n_strings <= n_numbers:
            col_types.append('number')
        else:
            col_types.append('string')
    return col_types


def replace_na(in_stream, out_stream):
    """Copy *in_stream* to *out_stream*, replacing TARGET in number columns.

    The input is read once: it is spooled to a temporary file while the
    column types are tallied, then the spooled copy is re-read to perform the
    replacement.  Every output line is terminated with a newline.  Empty
    input produces no output.
    """
    header = in_stream.readline()
    if not header:
        return

    header_fields = split_fields(header)
    # Columns whose header label marks them as numerical regardless of content.
    numerical_cols = set(
        index for index, label in enumerate(header_fields)
        if label in NUMERICAL_COLUMNS
    )
    counts = [[0, 0] for _ in header_fields]

    # Text mode so that str lines can be written on Python 2 and Python 3.
    spool = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        # First pass: spool the input and tally the column types.
        with spool:
            spool.write(header)
            process_line_fields(header_fields, counts)
            for line in in_stream:
                spool.write(line)
                process_line_fields(split_fields(line), counts)

        col_types = classify_columns(counts, numerical_cols)
        replacement = str(REPLACEMENT)

        # Second pass: replace target in number columns.
        with open(spool.name, 'r') as spooled:
            for line in spooled:
                fields = split_fields(line)
                for index, value in enumerate(fields):
                    if value == TARGET and col_types[index] == 'number':
                        fields[index] = replacement
                out_stream.write(SEPARATOR.join(fields) + '\n')
    finally:
        # Clean up temp file, even when processing failed part-way.
        os.unlink(spool.name)


def main():
    """Run the filter from stdin to stdout."""
    replace_na(sys.stdin, sys.stdout)


if __name__ == '__main__':
    main()