comparison tabular_label_convert/tabular_label_convert.py @ 0:1f93906c2945 draft default tip

Uploaded
author kellrott
date Sun, 18 Nov 2012 01:42:40 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f93906c2945
1 #!/usr/bin/env python
2
3 import os
4 import csv
5 import sys
6 import array
7 import math
8 from copy import copy
9 from argparse import ArgumentParser
10
11
class DataException(Exception):
    """Raised when an input matrix is structurally malformed."""


class FloatMatrix:
    """Dense float matrix with named rows and columns.

    Values are kept row-major in one flat ``array.array("f")``.  ``rowmap``
    and ``colmap`` map each row/column name to its integer position, so the
    cell for (row, col) lives at ``rowmap[row] * ncols + colmap[col]``.
    """

    def __init__(self):
        # Text written in the top-left cell of the header by write().
        self.corner_name = "probe"
        # Everything below stays None until read() or init_blank() is called.
        self.data = None
        self.nrows = None
        self.ncols = None
        self.rowmap = None
        self.colmap = None

    def read(self, handle):
        """Load a tab-separated matrix from an iterable of text lines.

        The first line is the header: its first cell is ignored and the
        remaining cells become the column names.  Each later line is a row
        name followed by one value per column.  Cells that cannot be parsed
        as floats are stored as NaN.

        If a row name occurs more than once, the name refers to its last
        occurrence.

        Raises:
            DataException: if a row does not have exactly one value per
                header column.
        """
        header = None
        for line in handle:
            row = line.rstrip().split("\t")
            if header is None:
                header = row
                self.data = array.array("f")
                self.colmap = {}
                self.rowmap = {}
                self.ncols = len(row) - 1
                self.nrows = 0
                for i, c in enumerate(row[1:]):
                    self.colmap[c] = i
                continue

            if len(row) - 1 != self.ncols:
                raise DataException("Misformed matrix")
            # Index by the number of rows stored so far.  len(self.rowmap)
            # stops growing once a name repeats, which would point every
            # later row at the wrong offset in self.data.
            self.rowmap[row[0]] = self.nrows
            values = []
            for v in row[1:]:
                try:
                    values.append(float(v))
                except ValueError:
                    values.append(float("nan"))
            self.data.extend(values)
            self.nrows += 1

    def init_blank(self, rows, cols):
        """Reset to a ``len(rows)`` x ``len(cols)`` matrix filled with NaN.

        ``rows`` and ``cols`` are iterated once for names and must support
        ``len()``.
        """
        self.data = array.array("f")
        self.colmap = {}
        for i, c in enumerate(cols):
            self.colmap[c] = i
        self.rowmap = {}
        for i, r in enumerate(rows):
            self.rowmap[r] = i
        self.ncols = len(cols)
        self.nrows = len(rows)
        for _ in range(self.nrows):
            self.data.extend([float("nan")] * self.ncols)

    def _offset(self, row_name, col_name):
        """Return the flat index of a cell; KeyError on unknown names."""
        return self.rowmap[row_name] * self.ncols + self.colmap[col_name]

    def get_value(self, row_name, col_name):
        """Return the value stored at (row_name, col_name)."""
        return self.data[self._offset(row_name, col_name)]

    def set_value(self, row_name, col_name, value):
        """Store ``value`` at (row_name, col_name)."""
        self.data[self._offset(row_name, col_name)] = value

    def get_row(self, row_name):
        """Return the values of one row as an array slice, in column order."""
        start = self.rowmap[row_name] * self.ncols
        return self.data[start:start + self.ncols]

    def get_cols(self):
        """Return the column names ordered by column position."""
        return sorted(self.colmap, key=self.colmap.get)

    def has_row(self, row):
        """Return True if ``row`` is a known row name."""
        return row in self.rowmap

    def has_col(self, col):
        """Return True if ``col`` is a known column name."""
        return col in self.colmap

    def get_rows(self):
        """Return the row names ordered by row position."""
        return sorted(self.rowmap, key=self.rowmap.get)

    def write(self, handle, missing='NA'):
        """Write the matrix to ``handle`` as tab-separated text.

        A header line (``corner_name`` followed by the column names) is
        written first, then one line per row in row-position order.  NaN
        cells are written as ``missing``; other values use ``%.5f``.
        """
        writer = csv.writer(handle, delimiter="\t", lineterminator='\n')
        col_list = self.get_cols()

        writer.writerow([self.corner_name] + col_list)
        # Use get_rows() so the output follows matrix order instead of the
        # dict's own iteration order.
        for row_name in self.get_rows():
            out = [row_name]
            row = self.get_row(row_name)
            for col in col_list:
                val = row[self.colmap[col]]
                if val is None or math.isnan(val):
                    val = missing
                else:
                    val = "%.5f" % (val)
                out.append(val)
            writer.writerow(out)
99
100
def median(inList):
    """Return the median of the values in ``inList``.

    The input is not modified.  An empty input yields NaN.  For an even
    number of values the result is the mean of the two middle values (a
    float); for an odd number it is the middle value itself.
    """
    values = sorted(inList)
    count = len(values)
    if count == 0:
        return float("nan")
    # Floor division keeps the index an int on Python 3 as well as Python 2.
    mid = count // 2
    if count % 2 == 1:
        return values[mid]
    return (values[mid] + values[mid - 1]) / 2.0
115
def mean(inList):
    """Return the arithmetic mean of ``inList`` as a float.

    An empty input raises ZeroDivisionError.
    """
    count = float(len(inList))
    total = sum(inList)
    return total / count
118
def aliasRemap(inputMatrix, aliasMap, mode, combine_func):
    """Build a new FloatMatrix whose labels on one axis are replaced by aliases.

    Args:
        inputMatrix: source matrix; must provide has_row/has_col,
            get_rows/get_cols and get_value.
        aliasMap: mapping of original label -> iterable of alias names.
        mode: "row" to relabel rows, "col" to relabel columns.
        combine_func: callable taking a non-empty list of values and
            returning the single value to store when several original
            labels map to the same alias.

    Returns:
        A FloatMatrix with one row (or column) per alias that has at least
        one original label present in ``inputMatrix``; the other axis is
        copied from the input unchanged.  Labels with no alias are dropped.

    Raises:
        ValueError: if ``mode`` is neither "row" nor "col".
    """
    if mode == "row":
        by_row = True
        has_label = inputMatrix.has_row
    elif mode == "col":
        by_row = False
        has_label = inputMatrix.has_col
    else:
        raise ValueError("Unknown mode %r (expected 'row' or 'col')" % (mode,))

    # Invert the map: alias -> {label: True}, keeping only labels that
    # actually exist on the chosen axis of the input matrix.
    i_am = {}
    for label in aliasMap:
        if has_label(label):
            for alias in aliasMap[label]:
                if alias not in i_am:
                    i_am[alias] = {}
                i_am[alias][label] = True

    # The untouched axis is the same for every alias, so fetch it once.
    if by_row:
        fixed_axis = inputMatrix.get_cols()
    else:
        fixed_axis = inputMatrix.get_rows()

    out = FloatMatrix()
    if by_row:
        out.init_blank(rows=list(i_am), cols=fixed_axis)
    else:
        out.init_blank(rows=fixed_axis, cols=list(i_am))

    for alias in i_am:
        # Every alias in i_am has at least one label and all of them were
        # checked against the matrix above, so no re-check is needed here.
        labels = i_am[alias]
        for other in fixed_axis:
            if by_row:
                values = [inputMatrix.get_value(row_name=label, col_name=other)
                          for label in labels]
                out.set_value(row_name=alias, col_name=other,
                              value=combine_func(values))
            else:
                values = [inputMatrix.get_value(row_name=other, col_name=label)
                          for label in labels]
                out.set_value(row_name=other, col_name=alias,
                              value=combine_func(values))
    return out
167
168
# Command-line name of each combine method -> the function that reduces a
# list of values to a single value.
combine_map = dict(mean=mean, median=median, max=max, min=min)
175
if __name__ == "__main__":
    # Command-line entry point: read a tabular matrix and an alias map,
    # relabel the rows or columns, and write the result.
    parser = ArgumentParser()

    # choices= makes argparse reject unsupported values with a usage message
    # instead of failing later with a KeyError or a None result.
    parser.add_argument("-m", "--mode", dest="mode", help="Row/Column mode",
                        choices=["row", "col"], default="row")
    parser.add_argument("-c", "--combine", dest="combine", help="Value Combine Method",
                        choices=sorted(combine_map), default="mean")
    parser.add_argument("-o", "--output", help="Output file", default=None)
    parser.add_argument("inTab", help="Input tabular file")
    parser.add_argument("aliasMap", help="Input alias map")

    args = parser.parse_args()

    mtx = FloatMatrix()
    with open(args.inTab) as handle:
        mtx.read(handle)

    # Alias file: one "label<TAB>alias" pair per line; a label may appear on
    # several lines to map it to several aliases.
    aliasMap = {}
    with open(args.aliasMap) as handle:
        for line in handle:
            tmp = line.rstrip().split("\t")
            if len(tmp) < 2:
                # Blank line or a label without an alias: nothing to map.
                continue
            if tmp[0] not in aliasMap:
                aliasMap[tmp[0]] = {tmp[1]: True}
            else:
                aliasMap[tmp[0]][tmp[1]] = True

    out = aliasRemap(mtx, aliasMap, args.mode, combine_map[args.combine])
    if args.output is None:
        # Write to stdout without closing it; the interpreter owns it.
        out.write(sys.stdout)
    else:
        with open(args.output, "w") as handle:
            out.write(handle)
209