annotate home/ubuntu/lefse_to_export/format_input.py @ 1:db64b6287cd6 draft

Modified datatypes
author george-weingart
date Wed, 20 Aug 2014 16:56:51 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
2
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
3 import sys,os,argparse,pickle,re
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
4
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def read_input_file(inp_file):
    """Load a tab-separated text file as a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters; each resulting cell is stripped as well.

    inp_file: path of the file to read.

    Returns a list holding one list of cell strings per line of the file.
    """
    rows = []
    with open(inp_file) as handle:
        for raw_line in handle:
            cells = raw_line.strip().split("\t")
            rows.append([cell.strip() for cell in cells])
    return rows
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
8
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def transpose(data):
    """Swap the rows and columns of a table.

    data: iterable of rows (each row an iterable of cells).

    Returns a list of tuples, one per column of ``data``. As with ``zip``,
    the result is truncated to the length of the shortest row.
    """
    # Materialise the result: on Python 3 a bare zip() is a one-shot
    # iterator, but callers need len(), indexing and pop() on it.
    return list(zip(*data))
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
11
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def read_params(args):
    """Parse the command-line options of the LEfSe formatting script.

    args: argv-style sequence whose first element is the program name
        (for example ``sys.argv``). When it is empty or None, argparse
        falls back to reading ``sys.argv`` itself.

    Returns a dict mapping every option's destination name
    (``input_file``, ``output_file``, ``output_table``, ``feats_dir``,
    ``class``, ``subclass``, ``norm_v``, ``subject``, ``missing_p``,
    ``subcl_min_card``) to its parsed value.

    Invalid arguments make argparse print a usage message and raise
    SystemExit.
    """
    parser = argparse.ArgumentParser(description='LEfSe formatting modules')
    parser.add_argument('input_file', metavar='INPUT_FILE', type=str,
                        help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")
    parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
                        help="the output file containing the data for LEfSe")
    parser.add_argument('--output_table', type=str, required=False, default="",
                        help="the formatted table in txt format")
    parser.add_argument('-f', dest="feats_dir", choices=["c", "r"], type=str, default="r",
                        help="set whether the features are on rows (default) or on columns")
    parser.add_argument('-c', dest="class", metavar="[1..n_feats]", type=int, default=1,
                        help="set which feature use as class (default 1)")
    parser.add_argument('-s', dest="subclass", metavar="[1..n_feats]", type=int, default=None,
                        help="set which feature use as subclass (default -1 meaning no subclass)")
    parser.add_argument('-o', dest="norm_v", metavar="float", type=float, default=-1.0,
                        help="set the normalization value (default -1.0 meaning no normalization)")
    parser.add_argument('-u', dest="subject", metavar="[1..n_feats]", type=int, default=None,
                        help="set which feature use as subject (default -1 meaning no subject)")
    # The default used to be "d", which is not one of the accepted choices
    # and contradicted the help text; "f" is the documented default.
    parser.add_argument('-m', dest="missing_p", choices=["f", "s"], type=str, default="f",
                        help="set the policy to adopt with missing values: f removes the features with missing values, s removes samples with missing values (default f)")
    parser.add_argument('-n', dest="subcl_min_card", metavar="int", type=int, default=10,
                        help="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")

    # Honour the caller-supplied argv (minus the program name) instead of
    # silently ignoring it; passing None lets argparse read sys.argv.
    argv = list(args[1:]) if args else None
    parsed = parser.parse_args(argv)

    return vars(parsed)
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
37
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def remove_missing(data, roc):
    """Drop the records of a table that contain missing values.

    A record is dropped when its number of non-blank cells (cells that are
    neither empty nor whitespace-only) is smaller than the length of the
    longest record.

    data: list of records, each a sequence of strings.
    roc: when equal to "c" the table is transposed before filtering and
        transposed back afterwards, so that columns are filtered instead
        of rows; any other value filters the rows directly.
        NOTE(review): the command line only offers "f" and "s" for the
        missing-value policy, neither of which matches "c" - confirm the
        intended mapping before re-enabling the call site.

    Returns the filtered table. When ``roc`` is not "c" the input list is
    modified in place and returned; otherwise a new list of tuples is
    returned.
    """
    if roc == "c":
        data = list(zip(*data))
    if not data:
        # Nothing to filter; also avoids max() on an empty sequence.
        return data

    width = max(len(record) for record in data)
    incomplete = [
        index
        for index, record in enumerate(data)
        if sum(1 for cell in record if cell != "" and not cell.isspace()) < width
    ]
    # Delete from the end so earlier indices stay valid. The original
    # iterated over the None returned by list.reverse() and crashed.
    for index in reversed(incomplete):
        data.pop(index)

    if roc == "c":
        return list(zip(*data))
    return data
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
49
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
50
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def sort_by_cl(data, n, c, s, u):
    """Sort sample records in place by class, then subclass and/or subject.

    data: list of indexable records (one per sample).
    n: number of sort levels in use. 1 sorts by class only; 2 sorts by
        class and then by subclass (or by subject when ``s`` is None);
        3 sorts by class, subclass and subject. Any other value leaves
        the order untouched.
    c: index of the class value inside each record.
    s: index of the subclass value, or None.
    u: index of the subject value, or None.

    Returns the same list object, sorted ascending. The sort is stable,
    so records with identical keys keep their input order.
    """
    if n == 3:
        fields = (c, s, u)
    elif n == 2:
        fields = (c, u) if s is None else (c, s)
    elif n == 1:
        fields = (c,)
    else:
        return data

    # A key function replaces the old cmp-style comparators, which only
    # worked with Python 2's list.sort(cmp) and never reported equality.
    data.sort(key=lambda record: tuple(record[f] for f in fields))
    return data
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
72
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def group_small_subclasses(cls,min_subcl):
    """Relabel entries of cls['class'] / cls['subclass'] and return cls.

    cls: dict with at least the keys 'class' and 'subclass', each holding
        a sequence of labels.
    min_subcl: integer threshold compared against the counter ``n``.

    Returns the same dict, with 'class' and 'subclass' replaced by the
    (partly rewritten) list copies built here.

    NOTE(review): the only call site visible in this file is commented
    out, and the body does not look finished - see the notes below.
    Presumably the goal was to merge subclasses with fewer than
    ``min_subcl`` samples into an "other" group; confirm before use.
    """
    last = ""
    n = 0
    repl = []
    # dd holds exactly two items: a copy of the class labels and a copy of
    # the subclass labels. It is NOT a list of (class, subclass) pairs.
    dd = [list(cls['class']),list(cls['subclass'])]
    # Each d is therefore a whole label list and d[1] is its second entry.
    # NOTE(review): presumably this was meant to walk (class, subclass)
    # pairs, e.g. via zip - confirm.
    for d in dd:
        if d[1] != last:
            if n < min_subcl and last != "":
                # Records the label being switched to, not the one just left.
                repl.append(d[1])
            last = d[1]
        # n is reset to 1 on every pass and never incremented.
        # NOTE(review): the nesting level of this statement is an
        # assumption - verify against the upstream file.
        n = 1
    for i,d in enumerate(dd):
        # Only element 1 of each of the two lists is rewritten: replaced by
        # "other" if listed in repl, then prefixed with element 0 and "_".
        if d[1] in repl: dd[i][1] = "other"
        dd[i][1] = str(dd[i][0])+"_"+str(dd[i][1])
    cls['class'] = dd[0]
    cls['subclass'] = dd[1]
    return cls
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
90
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def get_class_slices(data):
    """Locate the contiguous runs of each class and subclass.

    data: non-empty indexable sequence of records where element 0 is the
        class label and element 1 the subclass label, already ordered so
        that equal labels are adjacent.

    Returns three dicts:
      * class label -> (start, end) index range of its records,
      * subclass label -> (start, end) index range of its records,
      * class label -> list of the subclass labels found inside it.
    Ranges are half-open. If a label occurs in several separate runs, the
    last run wins.

    Raises IndexError when ``data`` is empty.
    """
    current_class, current_subclass = data[0][0], data[0][1]
    class_ranges = {}
    subclass_ranges = {}
    hierarchy = {}
    class_start = 0
    subclass_start = 0
    members = []

    for position, record in enumerate(data):
        record_class, record_subclass = record[0], record[1]
        if record_subclass != current_subclass:
            # Close the subclass run that just ended.
            subclass_ranges[current_subclass] = (subclass_start, position)
            subclass_start = position
            members.append(current_subclass)
        if record_class != current_class:
            # Close the class run together with the subclasses it contained.
            class_ranges[current_class] = (class_start, position)
            hierarchy[current_class] = members
            members = []
            class_start = position
        current_subclass = record_subclass
        current_class = record_class

    # Close the runs that are still open after the last record.
    end = position + 1
    subclass_ranges[current_subclass] = (subclass_start, end)
    members.append(current_subclass)
    class_ranges[current_class] = (class_start, end)
    hierarchy[current_class] = members
    return class_ranges, subclass_ranges, hierarchy
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
117
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def numerical_values(feats, norm):
    """Convert feature values to floats and optionally normalise samples.

    feats: dict mapping a feature name to a sequence of per-sample values
        (anything ``float()`` accepts). The dict is updated in place.
    norm: target per-sample total; a negative value disables
        normalisation.

    When normalising, each sample (column) is scaled so that its
    reference total equals ``norm``. The table is considered hierarchical
    when the feature names contain more dots in total than there are
    features; in that case only the names without any dot contribute to
    the reference total, unless those totals are all zero, in which case
    every feature is summed. A sample whose reference total is zero is
    scaled by 0.0.

    Returns ``feats``.
    """
    for name, values in feats.items():
        feats[name] = [float(value) for value in values]
    if norm < 0.0 or not feats:
        return feats

    # Snapshot names and columns as lists: dict views and zip() cannot be
    # indexed on Python 3. Both follow the same dict iteration order.
    names = list(feats)
    columns = list(zip(*feats.values()))
    hierarchical = sum(name.count(".") for name in names) > len(names)

    totals = []
    if hierarchical:
        totals = [
            sum(value for name, value in zip(names, column) if name.count(".") < 1)
            for column in columns
        ]
    if not hierarchical or sum(totals) == 0:
        totals = [sum(column) for column in columns]

    scale = [0.0 if total == 0 else float(norm) / total for total in totals]
    for name, values in feats.items():
        feats[name] = [value * scale[i] for i, value in enumerate(values)]
    return feats
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
140
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def add_missing_levels2(ff):
    """Add the missing parent levels of dot-separated feature names.

    ff: dict mapping a feature name (levels separated by ".") to a
        sequence of per-sample values. It is updated in place.

    Working from the deepest level upwards, every feature whose direct
    parent (the name without its last ".xxx" part) is absent gets that
    parent added, with values equal to the per-sample sum (as floats) of
    all features sharing that direct parent. The scan restarts after each
    level that produced additions, until nothing is missing.

    Returns ``ff``; it is returned untouched when no name contains a dot.
    """
    if sum(name.count(".") for name in ff) < 1:
        return ff

    added = True
    while added:
        added = False
        # Rebuilt on every pass so earlier names are not accumulated and
        # reprocessed again and again.
        by_depth = {}
        for name in ff:
            depth = name.count(".")
            if depth:
                by_depth.setdefault(depth, []).append(name)

        for depth in sorted(by_depth, reverse=True):
            for name in by_depth[depth]:
                parent = name.rpartition(".")[0]
                if parent in ff:
                    continue
                # All features that are direct children of the missing parent.
                children = [
                    ff[other] for other in ff
                    if "." in other and other.rpartition(".")[0] == parent
                ]
                ff[parent] = [
                    sum(float(value) for value in column)
                    for column in zip(*children)
                ]
                added = True
            if added:
                # New parents may themselves lack a parent: rescan.
                break

    return ff
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
169
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
170
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def add_missing_levels(ff):
    """Add every missing ancestor of the dot-separated feature names.

    ff: dict mapping a feature name (levels separated by ".") to a
        sequence of per-sample values. It is updated in place.

    For each proper prefix of a multi-level name that is not already a
    key, a new entry is created whose values are the per-sample sums (as
    floats) of all original features lying below that prefix.

    Returns ``ff``; it is returned untouched when no name contains a dot.
    """
    if not any("." in name for name in ff):
        return ff

    # prefix -> names of the original features located underneath it
    below = {}
    for name in ff:
        parts = name.split(".")
        if len(parts) < 2:
            continue
        for depth in range(len(parts)):
            prefix = ".".join(parts[:depth])
            below.setdefault(prefix, []).append(name)

    for prefix, members in below.items():
        # Skip the empty prefix produced at depth 0 and existing levels.
        if not prefix or prefix in ff:
            continue
        numeric = [[float(value) for value in ff[member]] for member in members]
        ff[prefix] = [sum(column) for column in zip(*numeric)]
    return ff
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
189
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
190
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
191
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def modify_feature_names(fn):
    """Sanitise a list of feature names.

    For every name, in this order:
      * spaces and the characters $ @ # % ^ & * " ' are removed,
      * the characters / ( ) - + = { } [ ] , . ; : ? < > become "_",
      * "|" becomes "." (the hierarchy separator used downstream),
      * a name starting with an ASCII digit is prefixed with "f_".

    fn: iterable of name strings.

    Returns a new list of sanitised names. A name that is empty (or
    becomes empty) is returned as an empty string instead of raising
    IndexError.
    """
    cleaned = []
    for name in fn:
        name = re.sub(r"""[ $@#%^&*"']""", "", name)
        name = re.sub(r"[/()\-+={}\[\],.;:?<>]", "_", name)
        # Done last so the dots introduced here are not turned into "_".
        name = name.replace("|", ".")
        if name and name[0] in "0123456789":
            name = "f_" + name
        cleaned.append(name)
    return cleaned
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
210
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
211
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def rename_same_subcl(cl, subcl):
    """Disambiguate subclass labels that are shared by several classes.

    cl: sequence of class labels (strings), one per sample.
    subcl: sequence of subclass labels (strings), aligned with ``cl``.

    Returns a new list of subclass labels in which every label that
    appears under more than one class is rewritten as
    ``<class>_<subclass>``; all other labels are kept as they are.
    """
    # subclass label -> set of the classes it appears under
    owners = {}
    for position, label in enumerate(subcl):
        owners.setdefault(label, set()).add(cl[position])

    shared = set(label for label, classes in owners.items() if len(classes) > 1)

    return [
        cl[position] + "_" + label if label in shared else label
        for position, label in enumerate(subcl)
    ]
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
222
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: read the tabular input, sort the samples by
    # class/subclass/subject, split metadata rows from feature rows, fill
    # in missing hierarchy levels, normalise, and pickle the result.
    params = read_params(sys.argv)

    # A subclass/subject index below 1 means "not provided".
    for option in ('subclass', 'subject'):
        if isinstance(params[option], int) and params[option] < 1:
            params[option] = None

    # Use the parsed positional rather than sys.argv[1], which is wrong
    # whenever an option precedes the input file on the command line.
    data = read_input_file(params['input_file'])

    if params['feats_dir'] == "c":
        data = list(transpose(data))

    # Number of sort levels: class, plus subclass and subject if given.
    ncl = 1
    if params['subclass'] is not None: ncl += 1
    if params['subject'] is not None: ncl += 1

    class_idx = params['class'] - 1
    subclass_idx = params['subclass'] - 1 if params['subclass'] is not None else None
    subject_idx = params['subject'] - 1 if params['subject'] is not None else None

    # columns[0] holds the row names; every other column is one sample.
    columns = list(zip(*data))
    if not columns:
        sys.exit("The input file contains no data")

    first_line = modify_feature_names(list(columns[0]))

    samples = sort_by_cl(columns[1:], ncl, class_idx, subclass_idx, subject_idx)
    # Back to one tuple per row: (row name, value for sample 1, ...).
    data = list(zip(first_line, *samples))

    cls = {}

    cls_i = [('class', class_idx)]
    if subclass_idx is not None: cls_i.append(('subclass', subclass_idx))
    if subject_idx is not None: cls_i.append(('subject', subject_idx))
    # Pop the metadata rows from the highest index down so that the
    # remaining indices stay valid.
    cls_i.sort(key=lambda item: item[1], reverse=True)
    for name, row_idx in cls_i:
        cls[name] = data.pop(row_idx)[1:]
    if params['subclass'] is None:
        cls['subclass'] = [str(cl) + "_subcl" for cl in cls['class']]

    cls['subclass'] = rename_same_subcl(cls['class'], cls['subclass'])

    # Pair class and subclass explicitly: get_class_slices reads element 0
    # as the class and element 1 as the subclass, which cls.values() only
    # guaranteed by accident of dict ordering.
    class_sl, subclass_sl, class_hierarchy = get_class_slices(
        list(zip(cls['class'], cls['subclass'])))

    feats = {row[0]: row[1:] for row in data}

    feats = add_missing_levels(feats)

    feats = numerical_values(feats, params['norm_v'])
    out = {}
    out['feats'] = feats
    out['norm'] = params['norm_v']
    out['cls'] = cls
    out['class_sl'] = class_sl
    out['subclass_sl'] = subclass_sl
    out['class_hierarchy'] = class_hierarchy

    if params['output_table']:
        with open(params['output_table'], "w") as outf:
            if 'class' in cls: outf.write("\t".join(["class"] + list(cls['class'])) + "\n")
            if 'subclass' in cls: outf.write("\t".join(["subclass"] + list(cls['subclass'])) + "\n")
            if 'subject' in cls: outf.write("\t".join(["subject"] + list(cls['subject'])) + "\n")
            for k, v in out['feats'].items(): outf.write("\t".join([k] + [str(vv) for vv in v]) + "\n")

    with open(params['output_file'], 'wb') as back_file:
        pickle.dump(out, back_file)
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
286