annotate tabular_label_convert/tabular_label_convert.py @ 0:1f93906c2945 draft default tip

Uploaded
author kellrott
date Sun, 18 Nov 2012 01:42:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
1 #!/usr/bin/env python
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
2
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
3 import os
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
4 import csv
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
5 import sys
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
6 import array
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
7 import math
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
8 from copy import copy
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
9 from argparse import ArgumentParser
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
10
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
11
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
class DataException(Exception):
    """Raised when a tabular matrix file is structurally invalid."""


class FloatMatrix:
    """A labelled, dense matrix of single-precision floats.

    Values are stored row-major in one flat ``array.array("f")``.  Row and
    column labels are mapped to their integer positions by ``rowmap`` and
    ``colmap``.  Missing values are represented as NaN.
    """

    def __init__(self):
        # Text written in the top-left cell of the header row by write().
        self.corner_name = "probe"
        # Flat row-major value buffer; None until read()/init_blank() runs.
        self.data = None
        self.nrows = None
        self.ncols = None
        # label -> zero-based row / column position
        self.rowmap = None
        self.colmap = None

    def read(self, handle):
        """Load a tab-separated matrix from an iterable of text lines.

        The first line is the header: its first cell is ignored and the
        remaining cells become the column labels.  On every following line
        the first cell is the row label and the remaining cells are values.
        Cells that cannot be parsed as floats (e.g. ``NA`` or an empty
        cell) are stored as NaN.  Completely empty lines are skipped.

        Raises:
            DataException: if a data line does not have exactly one value
                per header column.
        """
        header_seen = False
        for line in handle:
            # Strip only the line terminator: trailing tabs delimit empty
            # (missing) cells and must survive the split.
            line = line.rstrip("\r\n")
            if not line:
                continue
            row = line.split("\t")
            if not header_seen:
                header_seen = True
                self.data = array.array("f")
                self.colmap = {}
                self.rowmap = {}
                self.ncols = len(row) - 1
                self.nrows = 0
                for i, c in enumerate(row[1:]):
                    self.colmap[c] = i
            else:
                if len(row) - 1 != self.ncols:
                    raise DataException("Misformed matrix")
                # Index by the physical row count so that a repeated row
                # label cannot shift the offsets of the rows after it.
                self.rowmap[row[0]] = self.nrows
                values = []
                for v in row[1:]:
                    try:
                        values.append(float(v))
                    except ValueError:
                        values.append(float("nan"))
                self.data.extend(values)
                self.nrows += 1

    def init_blank(self, rows, cols):
        """Reset the matrix to ``len(rows)`` x ``len(cols)`` NaN values.

        ``rows`` and ``cols`` are iterables of labels; their iteration
        order defines the row and column positions.
        """
        # Materialise first so one-shot iterables and dict views both work.
        rows = list(rows)
        cols = list(cols)
        self.colmap = {}
        for i, c in enumerate(cols):
            self.colmap[c] = i
        self.rowmap = {}
        for i, r in enumerate(rows):
            self.rowmap[r] = i
        self.ncols = len(cols)
        self.nrows = len(rows)
        self.data = array.array("f", [float("nan")]) * (self.nrows * self.ncols)

    def _offset(self, row_name, col_name):
        """Return the flat index of a cell; KeyError for unknown labels."""
        return self.rowmap[row_name] * self.ncols + self.colmap[col_name]

    def get_value(self, row_name, col_name):
        """Return the value stored at (row_name, col_name)."""
        return self.data[self._offset(row_name, col_name)]

    def set_value(self, row_name, col_name, value):
        """Store ``value`` at (row_name, col_name)."""
        self.data[self._offset(row_name, col_name)] = value

    def get_row(self, row_name):
        """Return a copy of one row's values as an array, in column order."""
        start = self.rowmap[row_name] * self.ncols
        return self.data[start:start + self.ncols]

    def get_cols(self):
        """Return the column labels as a list, ordered by position."""
        return sorted(self.colmap, key=self.colmap.get)

    def has_row(self, row):
        """Return True if ``row`` is a known row label."""
        return row in self.rowmap

    def has_col(self, col):
        """Return True if ``col`` is a known column label."""
        return col in self.colmap

    def get_rows(self):
        """Return the row labels as a list, ordered by position."""
        return sorted(self.rowmap, key=self.rowmap.get)

    def write(self, handle, missing='NA'):
        """Write the matrix to ``handle`` as tab-separated text.

        The header row is ``corner_name`` followed by the column labels.
        Each value is formatted with five decimal places; NaN values are
        written as ``missing``.  Rows are emitted in positional order.
        """
        writer = csv.writer(handle, delimiter="\t", lineterminator='\n')
        col_list = self.get_cols()

        writer.writerow([self.corner_name] + col_list)
        # get_rows() rather than raw dict iteration: output order must not
        # depend on dict ordering.
        for row_name in self.get_rows():
            out = [row_name]
            row = self.get_row(row_name)
            for col in col_list:
                val = row[self.colmap[col]]
                if math.isnan(val):
                    out.append(missing)
                else:
                    out.append("%.5f" % (val))
            writer.writerow(out)
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
99
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
100
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
def median(inList):
    """Return the median of a sequence of numbers.

    The input is not modified.  An empty input yields NaN.  For an even
    number of values the result is the mean of the two middle values.
    """
    # sorted() copies, so the caller's data is untouched and any iterable
    # (list, tuple, array) is accepted.
    values = sorted(inList)
    count = len(values)
    if count == 0:
        return float("nan")
    # Floor division keeps the index an int on both Python 2 and Python 3.
    mid = count // 2
    if count % 2 == 1:
        return values[mid]
    return (values[mid] + values[mid - 1]) / 2.0
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
115
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
def mean(inList):
    """Return the arithmetic mean of a sequence of numbers.

    An empty input yields NaN, matching what median() returns for empty
    input, instead of raising ZeroDivisionError.
    """
    count = len(inList)
    if count == 0:
        return float("nan")
    return sum(inList) / float(count)
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
118
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
def aliasRemap(inputMatrix, aliasMap, mode, combine_func):
    """Build a new matrix whose row or column labels are replaced by aliases.

    Args:
        inputMatrix: source matrix; must provide has_row/has_col,
            get_rows/get_cols and get_value.
        aliasMap: mapping of original label -> iterable of alias labels.
        mode: "row" to relabel rows, "col" to relabel columns.
        combine_func: callable that reduces the list of values from all
            original labels sharing an alias to a single value.

    Returns:
        A FloatMatrix with one row (or column) per alias that has at least
        one original label present in ``inputMatrix``.  The other axis is
        copied unchanged.

    Raises:
        ValueError: if ``mode`` is neither "row" nor "col".
    """
    if mode == "row":
        has_label = inputMatrix.has_row
    elif mode == "col":
        has_label = inputMatrix.has_col
    else:
        # Previously fell through and returned None, which only surfaced
        # later as an AttributeError in the caller.
        raise ValueError("Unknown mode %r (expected 'row' or 'col')" % (mode,))

    # Invert the map: alias -> original labels that exist in the matrix.
    # The inner dict is used as a de-duplicating set of labels.
    members = {}
    for label in aliasMap:
        if has_label(label):
            for alias in aliasMap[label]:
                members.setdefault(alias, {})[label] = True

    aliases = list(members)
    out = FloatMatrix()

    # NOTE(review): NaN inputs are handed to combine_func unfiltered, so a
    # single missing value can dominate the combined result - confirm that
    # this is the intended handling of missing data.
    if mode == "row":
        # Hoisted: get_cols() sorts on every call.
        cols = inputMatrix.get_cols()
        out.init_blank(rows=aliases, cols=cols)
        for alias in aliases:
            for col in cols:
                # Every alias has at least one member, so this is non-empty.
                values = [
                    inputMatrix.get_value(row_name=label, col_name=col)
                    for label in members[alias]
                ]
                out.set_value(row_name=alias, col_name=col,
                              value=combine_func(values))
    else:
        rows = inputMatrix.get_rows()
        out.init_blank(rows=rows, cols=aliases)
        for alias in aliases:
            for row in rows:
                values = [
                    inputMatrix.get_value(row_name=row, col_name=label)
                    for label in members[alias]
                ]
                out.set_value(row_name=row, col_name=alias,
                              value=combine_func(values))
    return out
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
167
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
168
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
# Value-combining strategies, keyed by the name given to --combine.
# Each callable takes a list of floats and returns a single number.
combine_map = dict(
    mean=mean,
    median=median,
    max=max,
    min=min,
)
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
175
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: read a matrix and an alias map, relabel the
    # chosen axis, and write the result to a file or to stdout.
    parser = ArgumentParser()

    # `choices` turns a bad value into a usage error instead of a KeyError
    # or a None result further down.
    parser.add_argument("-m", "--mode", dest="mode", help="Row/Column mode",
                        default="row", choices=["row", "col"])
    parser.add_argument("-c", "--combine", dest="combine",
                        help="Value Combine Method", default="mean",
                        choices=sorted(combine_map))
    parser.add_argument("-o", "--output", help="Output file", default=None)
    parser.add_argument("inTab", help="Input tabular file")
    parser.add_argument("aliasMap", help="Input alias map")

    args = parser.parse_args()

    mtx = FloatMatrix()
    with open(args.inTab) as handle:
        mtx.read(handle)

    # Alias map file: two tab-separated columns, original label then alias.
    # A label may appear on several lines to map to several aliases.
    aliasMap = {}
    with open(args.aliasMap) as handle:
        for line in handle:
            tmp = line.rstrip().split("\t")
            if len(tmp) < 2:
                # Blank or incomplete line: nothing to map.
                continue
            aliasMap.setdefault(tmp[0], {})[tmp[1]] = True

    out = aliasRemap(mtx, aliasMap, args.mode, combine_map[args.combine])
    if args.output is None:
        # stdout belongs to the interpreter; do not close it here.
        out.write(sys.stdout)
    else:
        with open(args.output, "w") as handle:
            out.write(handle)
1f93906c2945 Uploaded
kellrott
parents:
diff changeset
209