comparison format_input.py @ 0:e7cd19afda2e draft

Lefse
author george-weingart
date Tue, 13 May 2014 21:57:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e7cd19afda2e
1 #!/usr/bin/env python
2
3 import sys,os,argparse,pickle,re
4
5 def read_input_file(inp_file):
6 with open(inp_file) as inp:
7 return [[v.strip() for v in line.strip().split("\t")] for line in inp.readlines()]
8
9 def transpose(data):
10 return zip(*data)
11
12 def read_params(args):
13 parser = argparse.ArgumentParser(description='LEfSe formatting modules')
14 parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
15 parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
16 help="the output file containing the data for LEfSe")
17 parser.add_argument('--output_table', type=str, required=False, default="",
18 help="the formatted table in txt format")
19 parser.add_argument('-f',dest="feats_dir", choices=["c","r"], type=str, default="r",
20 help="set whether the features are on rows (default) or on columns")
21 parser.add_argument('-c',dest="class", metavar="[1..n_feats]", type=int, default=1,
22 help="set which feature use as class (default 1)")
23 parser.add_argument('-s',dest="subclass", metavar="[1..n_feats]", type=int, default=None,
24 help="set which feature use as subclass (default -1 meaning no subclass)")
25 parser.add_argument('-o',dest="norm_v", metavar="float", type=float, default=-1.0,
26 help="set the normalization value (default -1.0 meaning no normalization)")
27 parser.add_argument('-u',dest="subject", metavar="[1..n_feats]", type=int, default=None,
28 help="set which feature use as subject (default -1 meaning no subject)")
29 parser.add_argument('-m',dest="missing_p", choices=["f","s"], type=str, default="d",
30 help="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")
31 parser.add_argument('-n',dest="subcl_min_card", metavar="int", type=int, default=10,
32 help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")
33
34 args = parser.parse_args()
35
36 return vars(args)
37
38 def remove_missing(data,roc):
39 if roc == "c": data = transpose(data)
40 max_len = max([len(r) for r in data])
41 to_rem = []
42 for i,r in enumerate(data):
43 if len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)
44 if len(to_rem):
45 for i in to_rem.reverse():
46 data.pop(i)
47 if roc == "c": return transpose(data)
48 return data
49
50
51 def sort_by_cl(data,n,c,s,u):
52 def sort_lines1(a,b):
53 return int(a[c] > b[c])*2-1
54 def sort_lines2u(a,b):
55 if a[c] != b[c]: return int(a[c] > b[c])*2-1
56 return int(a[u] > b[u])*2-1
57 def sort_lines2s(a,b):
58 if a[c] != b[c]: return int(a[c] > b[c])*2-1
59 return int(a[s] > b[s])*2-1
60 def sort_lines3(a,b):
61 if a[c] != b[c]: return int(a[c] > b[c])*2-1
62 if a[s] != b[s]: return int(a[s] > b[s])*2-1
63 return int(a[u] > b[u])*2-1
64 if n == 3: data.sort(sort_lines3)
65 if n == 2:
66 if s is None:
67 data.sort(sort_lines2u)
68 else:
69 data.sort(sort_lines2s)
70 if n == 1: data.sort(sort_lines1)
71 return data
72
73 def group_small_subclasses(cls,min_subcl):
74 last = ""
75 n = 0
76 repl = []
77 dd = [list(cls['class']),list(cls['subclass'])]
78 for d in dd:
79 if d[1] != last:
80 if n < min_subcl and last != "":
81 repl.append(d[1])
82 last = d[1]
83 n = 1
84 for i,d in enumerate(dd):
85 if d[1] in repl: dd[i][1] = "other"
86 dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
87 cls['class'] = dd[0]
88 cls['subclass'] = dd[1]
89 return cls
90
91 def get_class_slices(data):
92 previous_class = data[0][0]
93 previous_subclass = data[0][1]
94 subclass_slices = []
95 class_slices = []
96 last_cl = 0
97 last_subcl = 0
98 class_hierarchy = []
99 subcls = []
100 for i,d in enumerate(data):
101 if d[1] != previous_subclass:
102 subclass_slices.append((previous_subclass,(last_subcl,i)))
103 last_subcl = i
104 subcls.append(previous_subclass)
105 if d[0] != previous_class:
106 class_slices.append((previous_class,(last_cl,i)))
107 class_hierarchy.append((previous_class,subcls))
108 subcls = []
109 last_cl = i
110 previous_subclass = d[1]
111 previous_class = d[0]
112 subclass_slices.append((previous_subclass,(last_subcl,i+1)))
113 subcls.append(previous_subclass)
114 class_slices.append((previous_class,(last_cl,i+1)))
115 class_hierarchy.append((previous_class,subcls))
116 return dict(class_slices), dict(subclass_slices), dict(class_hierarchy)
117
118 def numerical_values(feats,norm):
119 mm = []
120 for k,v in feats.items():
121 feats[k] = [float(val) for val in v]
122 if norm < 0.0: return feats
123 tr = zip(*(feats.values()))
124 mul = []
125 fk = feats.keys()
126 hie = True if sum([k.count(".") for k in fk]) > len(fk) else False
127 for i in range(len(feats.values()[0])):
128 if hie: mul.append(sum([t for j,t in enumerate(tr[i]) if fk[j].count(".") < 1 ]))
129 else: mul.append(sum(tr[i]))
130 if hie and sum(mul) == 0:
131 mul = []
132 for i in range(len(feats.values()[0])):
133 mul.append(sum(tr[i]))
134 for i,m in enumerate(mul):
135 if m == 0: mul[i] = 0.0
136 else: mul[i] = float(norm) / m
137 for k,v in feats.items():
138 feats[k] = [val*mul[i] for i,val in enumerate(v)]
139 return feats
140
141 def add_missing_levels2(ff):
142
143 if sum( [f.count(".") for f in ff] ) < 1: return ff
144
145 dn = {}
146
147 added = True
148 while added:
149 added = False
150 for f in ff:
151 lev = f.count(".")
152 if lev == 0: continue
153 if lev not in dn: dn[lev] = [f]
154 else: dn[lev].append(f)
155 for fn in sorted(dn,reverse=True):
156 for f in dn[fn]:
157 fc = ".".join(f.split('.')[:-1])
158 if fc not in ff:
159 ab_all = [ff[fg] for fg in ff if (fg.count(".") == 0 and fg == fc) or (fg.count(".") > 0 and fc == ".".join(fg.split('.')[:-1]))]
160 ab =[]
161 for l in [f for f in zip(*ab_all)]:
162 ab.append(sum([float(ll) for ll in l]))
163 ff[fc] = ab
164 added = True
165 if added:
166 break
167
168 return ff
169
170
171 def add_missing_levels(ff):
172 if sum( [f.count(".") for f in ff] ) < 1: return ff
173
174 clades2leaves = {}
175 for f in ff:
176 fs = f.split(".")
177 if len(fs) < 2:
178 continue
179 for l in range(len(fs)):
180 n = ".".join( fs[:l] )
181 if n in clades2leaves:
182 clades2leaves[n].append( f )
183 else:
184 clades2leaves[n] = [f]
185 for k,v in clades2leaves.items():
186 if k and k not in ff:
187 ff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]
188 return ff
189
190
191
192 def modify_feature_names(fn):
193 ret = fn
194
195 for v in [' ',r'\$',r'\@',r'#',r'%',r'\^',r'\&',r'\*',r'\"',r'\'']:
196 ret = [re.sub(v,"",f) for f in ret]
197 for v in ["/",r'\(',r'\)',r'-',r'\+',r'=',r'{',r'}',r'\[',r'\]',
198 r',',r'\.',r';',r':',r'\?',r'\<',r'\>',r'\.',r'\,']:
199 ret = [re.sub(v,"_",f) for f in ret]
200 for v in ["\|"]:
201 ret = [re.sub(v,".",f) for f in ret]
202
203 ret2 = []
204 for r in ret:
205 if r[0] in ['0','1','2','3','4','5','6','7','8','9']:
206 ret2.append("f_"+r)
207 else: ret2.append(r)
208
209 return ret2
210
211
212 def rename_same_subcl(cl,subcl):
213 toc = []
214 for sc in set(subcl):
215 if len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:
216 toc.append(sc)
217 new_subcl = []
218 for i,sc in enumerate(subcl):
219 if sc in toc: new_subcl.append(cl[i]+"_"+sc)
220 else: new_subcl.append(sc)
221 return new_subcl
222
223 if __name__ == '__main__':
224 params = read_params(sys.argv)
225
226 if type(params['subclass']) is int and int(params['subclass']) < 1:
227 params['subclass'] = None
228 if type(params['subject']) is int and int(params['subject']) < 1:
229 params['subject'] = None
230 data = read_input_file(sys.argv[1])
231
232 if params['feats_dir'] == "c":
233 data = transpose(data)
234
235 ncl = 1
236 if not params['subclass'] is None: ncl += 1
237 if not params['subject'] is None: ncl += 1
238
239 first_line = zip(*data)[0]
240
241 first_line = modify_feature_names(list(first_line))
242
243 data = zip( first_line,
244 *sort_by_cl(zip(*data)[1:],
245 ncl,
246 params['class']-1,
247 params['subclass']-1 if not params['subclass'] is None else None,
248 params['subject']-1 if not params['subject'] is None else None))
249 # data.insert(0,first_line)
250 # data = remove_missing(data,params['missing_p'])
251 cls = {}
252
253 cls_i = [('class',params['class']-1)]
254 if params['subclass'] > 0: cls_i.append(('subclass',params['subclass']-1))
255 if params['subject'] > 0: cls_i.append(('subject',params['subject']-1))
256 cls_i.sort(lambda x, y: -cmp(x[1],y[1]))
257 for v in cls_i: cls[v[0]] = data.pop(v[1])[1:]
258 if not params['subclass'] > 0: cls['subclass'] = [str(cl)+"_subcl" for cl in cls['class']]
259
260 cls['subclass'] = rename_same_subcl(cls['class'],cls['subclass'])
261 # if 'subclass' in cls.keys(): cls = group_small_subclasses(cls,params['subcl_min_card'])
262 class_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))
263
264 feats = dict([(d[0],d[1:]) for d in data])
265
266 feats = add_missing_levels(feats)
267
268 feats = numerical_values(feats,params['norm_v'])
269 out = {}
270 out['feats'] = feats
271 out['norm'] = params['norm_v']
272 out['cls'] = cls
273 out['class_sl'] = class_sl
274 out['subclass_sl'] = subclass_sl
275 out['class_hierarchy'] = class_hierarchy
276
277 if params['output_table']:
278 with open( params['output_table'], "w") as outf:
279 if 'class' in cls: outf.write( "\t".join(list(["class"])+list(cls['class'])) + "\n" )
280 if 'subclass' in cls: outf.write( "\t".join(list(["subclass"])+list(cls['subclass'])) + "\n" )
281 if 'subject' in cls: outf.write( "\t".join(list(["subject"])+list(cls['subject'])) + "\n" )
282 for k,v in out['feats'].items(): outf.write( "\t".join([k]+[str(vv) for vv in v]) + "\n" )
283
284 with open(params['output_file'], 'wb') as back_file:
285 pickle.dump(out,back_file)
286