Mercurial > repos > devteam > table_annovar
view replace_NA.py @ 6:091154194ce8 draft
Uploaded
author | devteam |
---|---|
date | Wed, 12 Feb 2014 16:32:16 -0500 |
parents | 5cb2020a097a |
children | d4e292ddda05 |
line wrap: on
line source
#!/usr/bin/env python
"""
Reads a tabular file and replaces a target sequence (currently 'NA') with a
number in columns that have numerical values.

A column is treated as numerical when its header label is listed in
NUMERICAL_COLUMNS, or when its string values (the header label included) do
not outnumber its values that parse as floats.

Limitations: (a) the command line can only take input from stdin and
(b) cannot specify target or replacement.
"""
import os
import sys
import tempfile

# Constants.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1

# List of known numerical columns.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Default tally used by process_line_fields() when no explicit tally is
# given. One 2-element list per column: [# of string values, # of number
# values].
col_type_counts = []


def _split_line(line):
    '''
    Split a line into fields.

    Only the line terminator is removed: stripping all surrounding whitespace
    would also drop leading/trailing tabs and shift or truncate rows whose
    first or last field is empty.
    '''
    return line.rstrip('\r\n').split(SEPARATOR)


def process_line_fields(fields, counts=None):
    '''
    Tally, per column, how many values in fields are strings and how many
    are numbers.

    fields -- list of field values for one line.
    counts -- list of [string_count, number_count] pairs, updated in place;
              defaults to the module-level col_type_counts.
    '''
    if counts is None:
        counts = col_type_counts
    # Rows may be wider than any row seen so far; grow the tally rather than
    # failing with an IndexError.
    while len(counts) < len(fields):
        counts.append([0, 0])
    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == TARGET:
            continue
        try:
            float(f)
        except ValueError:
            # Not a number.
            counts[i][0] += 1
        else:
            counts[i][1] += 1


def _column_types(counts, numerical_cols):
    '''
    Return 'string' or 'number' for each column, based on label or consensus.

    counts         -- list of [string_count, number_count] pairs.
    numerical_cols -- indexes of columns that are numerical by label.
    '''
    types = []
    for i, (n_strings, n_numbers) in enumerate(counts):
        if i not in numerical_cols and n_strings > n_numbers:
            types.append('string')
        else:
            types.append('number')
    return types


def replace_targets(in_stream, out_stream):
    '''
    Copy tabular lines from in_stream to out_stream, replacing TARGET with
    REPLACEMENT in every column classified as a number.

    in_stream  -- iterable of text lines; the first line is the header.
    out_stream -- object with a write() method; each output line is
                  terminated with a newline.

    Column types are only known once all input has been seen, so the input
    is spooled to a temporary file and read back for the second pass.
    '''
    numerical_cols = set()
    counts = []
    replacement = str(REPLACEMENT)

    # Text mode so that str lines can be written on both Python 2 and 3.
    temp_out = tempfile.NamedTemporaryFile(mode='w', delete=False)
    temp_name = temp_out.name
    try:
        # First pass: store data and count value types per column.
        with temp_out:
            for line_num, line in enumerate(in_stream):
                fields = _split_line(line)
                if line_num == 0:
                    # Use the header to identify known numerical columns.
                    numerical_cols = set(
                        i for i, f in enumerate(fields)
                        if f.strip() in NUMERICAL_COLUMNS
                    )
                process_line_fields(fields, counts)
                temp_out.write(line)

        col_types = _column_types(counts, numerical_cols)

        # Second pass: replace target in number columns.
        with open(temp_name, 'r') as temp_in:
            for line in temp_in:
                fields = _split_line(line)
                for i, f in enumerate(fields):
                    if f == TARGET and col_types[i] == 'number':
                        fields[i] = replacement
                out_stream.write(SEPARATOR.join(fields) + '\n')
    finally:
        # Clean up temp file, even when processing failed part-way.
        os.unlink(temp_name)


def main():
    ''' Run the replacement from stdin to stdout. '''
    replace_targets(sys.stdin, sys.stdout)


if __name__ == '__main__':
    main()