Mercurial > repos > devteam > table_annovar
comparison replace_NA.py @ 3:5cb2020a097a draft
Uploaded
| author | devteam |
|---|---|
| date | Wed, 12 Feb 2014 15:44:18 -0500 |
| parents | |
| children | d4e292ddda05 |
comparison
equal
deleted
inserted
replaced
| 2:9c75a9b5ecd2 | 3:5cb2020a097a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Reads a tabular file and replaces a target sequence (currently 'NA') with a number in columns that have numerical values. | |
| 4 # Limitations: (a) can only take input from stdin and (b) cannot specify target or replacement. | |
| 5 | |
| 6 import sys | |
| 7 import os | |
| 8 import tempfile | |
| 9 | |
# Constants.
# Field delimiter of the tabular input and output.
SEPARATOR = '\t'
# Placeholder that is replaced in numerical columns.
TARGET = 'NA'
# Value written in place of TARGET (converted to text on output).
REPLACEMENT = -1
# Header names of columns that are always treated as numerical.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# The input is buffered in a temp file because the script makes two passes:
# one over stdin to collect per-column counts, one over the buffered copy to
# do the replacement. Text mode is requested explicitly because only text is
# written to it; delete=False keeps the file after close() so it can be
# reopened by name.
temp_out = tempfile.NamedTemporaryFile(mode='w', delete=False)

# Use first line to set up data structure and identify numerical columns.
first_line = sys.stdin.readline()
# Strip only the line terminator: a plain strip() would also remove leading
# and trailing tabs, dropping empty first/last cells and shifting the column
# indices computed below.
fields = first_line.rstrip('\r\n').split(SEPARATOR)
numerical_cols = [i for i, f in enumerate(fields) if f in NUMERICAL_COLUMNS]

# Data structure is a 2-element list for each field; first element is # of
# string elements and second element is # of number elements.
col_type_counts = [[0, 0] for _ in fields]
| 30 | |
| 31 # Set up function to process lines. | |
def process_line_fields(fields, counts=None, target=None):
    '''
    Tally, per column, how many values in one line look like strings and
    how many look like numbers.

    fields -- list of the line's field values (strings).
    counts -- list of [string_count, number_count] pairs, one per column,
              updated in place. Defaults to the module-level
              col_type_counts.
    target -- placeholder value that is skipped when counting. Defaults to
              the module-level TARGET.

    A value counts as a number when float() accepts it. If the line has
    more fields than counts has entries, counts is extended with [0, 0]
    pairs so that every column of the line has an entry. Returns None.
    '''
    # Resolve the defaults at call time so that the one-argument call keeps
    # operating on the module-level state.
    if counts is None:
        counts = col_type_counts
    if target is None:
        target = TARGET

    # A row wider than anything seen so far would otherwise raise IndexError.
    missing = len(fields) - len(counts)
    if missing > 0:
        counts.extend([0, 0] for _ in range(missing))

    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == target:
            continue

        try:
            float(f)
        except (TypeError, ValueError):
            # Not a number.
            type_index = 0
        else:
            type_index = 1
        counts[i][type_index] += 1
| 49 | |
| 50 | |
# Two-pass driver: buffer stdin into the temp file while tallying column
# value types, then re-read the buffered copy and write it to stdout with
# TARGET replaced in every column classified as a number.
temp_name = temp_out.name
try:
    # Process first line.
    process_line_fields(fields)
    temp_out.write(first_line)

    # Process N-1 lines.
    for line in sys.stdin:
        # Strip only the line terminator; strip() would also discard leading
        # and trailing empty fields and so drop columns from the output.
        fields = line.rstrip('\r\n').split(SEPARATOR)
        # Make sure every column of a row wider than the header has a
        # counter before the row is tallied.
        while len(col_type_counts) < len(fields):
            col_type_counts.append([0, 0])
        process_line_fields(fields)
        temp_out.write(line)

    # Close temp file so that it can be read.
    temp_out.close()

    # Get column type based on label or consensus: known numerical columns
    # are always numbers; otherwise a column is a string only when string
    # values strictly outnumber numeric ones.
    col_types = []
    for i, counts in enumerate(col_type_counts):
        if i in numerical_cols or counts[0] <= counts[1]:
            col_types.append('number')
        else:
            col_types.append('string')

    # Replace target in number columns.
    replacement = str(REPLACEMENT)
    with open(temp_name, 'r') as temp_in:
        for line in temp_in:
            fields = line.rstrip('\r\n').split(SEPARATOR)
            for i, f in enumerate(fields):
                if f == TARGET and col_types[i] == 'number':
                    fields[i] = replacement
            sys.stdout.write(SEPARATOR.join(fields) + '\n')
finally:
    # Clean up temp file, also when a step above failed. Closing an already
    # closed file is a no-op.
    temp_out.close()
    os.unlink(temp_name)
| 87 | |
| 88 | |
| 89 |
