Mercurial > repos > devteam > table_annovar
comparison replace_NA.py @ 3:5cb2020a097a draft
Uploaded
author | devteam |
---|---|
date | Wed, 12 Feb 2014 15:44:18 -0500 |
parents | |
children | d4e292ddda05 |
comparison
equal
deleted
inserted
replaced
2:9c75a9b5ecd2 | 3:5cb2020a097a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Reads a tabular file and replaces a target sequence (currently 'NA') with a number in columns that have numerical values. | |
4 # Limitations: (a) can only take input from stdin and (b) cannot specify target or replacement. | |
5 | |
6 import sys | |
7 import os | |
8 import tempfile | |
9 | |
# Fixed settings: field delimiter, the placeholder to look for, and the
# number written in its place.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1
# Header labels of columns that are always treated as numerical.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Input comes from stdin and is read only once, so every line is spooled to a
# temp file (kept after close via delete=False) for the later rewrite pass.
temp_out = tempfile.NamedTemporaryFile(delete=False)

# The first line fixes the number of columns and tells, by label, which
# columns are known to be numerical.
first_line = sys.stdin.readline()
fields = first_line.strip().split(SEPARATOR)
numerical_cols = [
    position
    for position, label in enumerate(fields)
    if label in NUMERICAL_COLUMNS
]

# One [string_count, number_count] pair per column of the first line.
col_type_counts = [[0, 0] for _ in fields]
30 | |
31 # Set up function to process lines. | |
def process_line_fields(fields):
    '''
    Tally, per column, whether each value of one row is a number or a string.

    Updates the module-level col_type_counts in place: for column i,
    col_type_counts[i][0] is incremented when the value is not parseable by
    float() and col_type_counts[i][1] when it is. Values equal to TARGET are
    skipped so that placeholders do not influence the tally.

    fields -- list of field strings taken from a single line.
    '''
    # A row may hold more fields than the first line did. Grow the table in
    # place (no rebinding, so every reader of col_type_counts sees the new
    # columns) instead of failing with IndexError.
    extra = len(fields) - len(col_type_counts)
    if extra > 0:
        col_type_counts.extend([0, 0] for _ in range(extra))

    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == TARGET:
            continue

        # float() raises ValueError for text that is not a number; catching
        # only that keeps KeyboardInterrupt and real errors visible.
        try:
            float(f)
        except ValueError:
            type_index = 0
        else:
            type_index = 1
        col_type_counts[i][type_index] += 1
49 | |
50 | |
# Everything below works on the temp file; the finally clause guarantees it is
# closed and removed even when one of the passes fails.
temp_name = temp_out.name
try:
    # Pass 1: tally column types for the first line and every remaining line
    # while spooling the unmodified input to the temp file.
    process_line_fields(fields)
    temp_out.write(first_line)
    for line in sys.stdin:
        # NOTE(review): strip() also removes leading/trailing empty fields, so
        # tallies for such rows may be shifted or incomplete; kept unchanged
        # here so that type detection behaves as before.
        fields = line.strip().split(SEPARATOR)
        process_line_fields(fields)
        temp_out.write(line)

    # Close temp file so that it can be read.
    temp_out.close()

    # Column type: 'number' when the header label is a known numerical column
    # or when strings do not outnumber numbers (ties count as 'number').
    col_types = []
    for i, counts in enumerate(col_type_counts):
        if i in numerical_cols or counts[0] <= counts[1]:
            col_types.append('number')
        else:
            col_types.append('string')

    # Pass 2: replace target in number columns and emit every row on stdout.
    replacement = str(REPLACEMENT)
    with open(temp_name, 'r') as temp_in:
        for line in temp_in:
            # Remove only the line terminator: a plain strip() would also drop
            # leading/trailing empty fields and change the row's column count.
            fields = line.rstrip('\r\n').split(SEPARATOR)
            for i, f in enumerate(fields):
                # Fields beyond the known columns have no type; leave them
                # untouched rather than indexing past col_types.
                if f == TARGET and i < len(col_types) and col_types[i] == 'number':
                    fields[i] = replacement
            sys.stdout.write(SEPARATOR.join(fields) + '\n')
finally:
    # Clean up temp file. close() is a no-op when the file is already closed.
    temp_out.close()
    os.unlink(temp_name)
87 | |
88 | |
89 |