annotate home/ubuntu/lefse_to_export/run_lefse.py @ 1:db64b6287cd6 draft

Modified datatypes
author george-weingart
date Wed, 20 Aug 2014 16:56:51 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
2
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
3 import os,sys,math,pickle
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
4 from lefse import *
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
5
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
def read_params(args):
    """Parse the LEfSe command line and return the options as a dict.

    Args:
        args: full argument vector in ``sys.argv`` layout, i.e. the program
            name first and the actual arguments after it.  ``None`` falls
            back to argparse's default of reading ``sys.argv`` itself.

    Returns:
        dict mapping every option ``dest`` (plus ``input_file`` and
        ``output_file``) to its parsed value.  When no title is given with
        ``-t``, ``title`` is the input file's base name truncated at its
        first dot.
    """
    # Imported here so the function does not depend on `from lefse import *`
    # happening to re-export argparse.
    import argparse

    parser = argparse.ArgumentParser(description='LEfSe 1.0')
    parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file")
    parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
                        help="the output file containing the data for the visualization module")
    parser.add_argument('-o', dest="out_text_file", metavar='str', type=str, default="",
                        help="set the file for exporting the result (only concise textual form)")
    parser.add_argument('-a', dest="anova_alpha", metavar='float', type=float, default=0.05,
                        help="set the alpha value for the Anova test (default 0.05)")
    parser.add_argument('-w', dest="wilcoxon_alpha", metavar='float', type=float, default=0.05,
                        help="set the alpha value for the Wilcoxon test (default 0.05)")
    parser.add_argument('-l', dest="lda_abs_th", metavar='float', type=float, default=2.0,
                        help="set the threshold on the absolute value of the logarithmic LDA score (default 2.0)")
    parser.add_argument('--nlogs', dest="nlogs", metavar='int', type=int, default=3,
                        help="max log influence of LDA coeff")
    parser.add_argument('--verbose', dest="verbose", metavar='int', choices=[0, 1], type=int, default=0,
                        help="verbose execution (default 0)")
    parser.add_argument('--wilc', dest="wilc", metavar='int', choices=[0, 1], type=int, default=1,
                        help="whether to perform the Wilcoxon step (default 1)")
    parser.add_argument('-r', dest="rank_tec", metavar='str', choices=['lda', 'svm'], type=str, default='lda',
                        help="select LDA or SVM for effect size (default LDA)")
    parser.add_argument('--svm_norm', dest="svm_norm", metavar='int', choices=[0, 1], type=int, default=1,
                        help="whether to normalize the data in [0,1] for SVM feature weighting (default 1 strongly suggested)")
    parser.add_argument('-b', dest="n_boots", metavar='int', type=int, default=30,
                        help="set the number of bootstrap iteration for LDA (default 30)")
    parser.add_argument('-e', dest="only_same_subcl", metavar='int', type=int, default=0,
                        help="set whether perform the wilcoxon test only among the subclasses with the same name (default 0)")
    parser.add_argument('-c', dest="curv", metavar='int', type=int, default=0,
                        help="set whether perform the wilcoxon test using the Curtis's approach [BETA VERSION] (default 0)")
    parser.add_argument('-f', dest="f_boots", metavar='float', type=float, default=0.67,
                        help="set the subsampling fraction value for each bootstrap iteration (default 0.67)")
    # NOTE(review): options 1 and 2 are both described as "independent";
    # one of them is presumably meant to read "dependent" - confirm against
    # the multiple-testing code before rewording.
    parser.add_argument('-s', dest="strict", choices=[0, 1, 2], type=int, default=0,
                        help="set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison")
    # parser.add_argument('-m',dest="m_boots", type=int, default=5,
    #                     help="minimum cardinality of classes in each bootstrapping iteration (default 5)")
    parser.add_argument('--min_c', dest="min_c", metavar='int', type=int, default=10,
                        help="minimum number of samples per subclass for performing wilcoxon test (default 10)")
    parser.add_argument('-t', dest="title", metavar='str', type=str, default="",
                        help="set the title of the analysis (default input file without extension)")
    parser.add_argument('-y', dest="multiclass_strat", choices=[0, 1], type=int, default=0,
                        help="(for multiclass tasks) set whether the test is performed in a one-against-one ( 1 - more strict!) or in a one-against-all setting ( 0 - less strict) (default 0)")

    # Honour the caller-supplied vector instead of silently re-reading the
    # global sys.argv; element 0 is the program name and is skipped, exactly
    # as argparse does for sys.argv.
    argv = None if args is None else list(args)[1:]
    params = vars(parser.parse_args(argv))

    if params['title'] == "":
        # Base name of the input path, cut at the first dot.
        params['title'] = params['input_file'].split("/")[-1].split('.')[0]
    return params
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
52
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
53
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
54
db64b6287cd6 Modified datatypes
george-weingart
parents:
diff changeset
if __name__ == '__main__':
    # Script driver: load the input, screen every feature with test_kw_r and
    # (optionally) test_rep_wilcoxon_r, rank the surviving features with
    # test_lda_r or test_svm, then hand everything to save_res.
    init()
    params = read_params(sys.argv)
    feats, cls, class_sl, subclass_sl, class_hierarchy = load_data(params['input_file'])
    kord, cls_means = get_class_means(class_sl, feats)
    verbose = params['verbose']

    wilcoxon_res = {}
    kw_n_ok = 0  # features that passed test_kw_r and went on to the Wilcoxon step

    # Rejected features are deleted from `feats` inside the loop, so iterate
    # over a snapshot: mutating a dict while iterating its live items() view
    # raises RuntimeError on Python 3 (Python 2 only worked because items()
    # returned a list copy).
    for nf, (feat_name, feat_values) in enumerate(list(feats.items())):
        if verbose:
            # Progress line is built piecewise; each branch below finishes it.
            sys.stdout.write("Testing feature %s :  %s" % (nf, feat_name))

        kw_ok, pv = test_kw_r(cls, feat_values, params['anova_alpha'], sorted(cls.keys()))
        if not kw_ok:
            if verbose:
                sys.stdout.write(" \tkw ko\n")
            del feats[feat_name]
            wilcoxon_res[feat_name] = "-"
            continue
        if verbose:
            sys.stdout.write(" \tkw ok\t")

        if not params['wilc']:
            # Wilcoxon step disabled: the feature is kept as is. Terminate
            # the progress line so consecutive features do not run together.
            if verbose:
                sys.stdout.write("\n")
            continue

        kw_n_ok += 1
        res_wilcoxon_rep = test_rep_wilcoxon_r(
            subclass_sl, class_hierarchy, feat_values,
            params['wilcoxon_alpha'], params['multiclass_strat'], params['strict'],
            feat_name, params['min_c'], params['only_same_subcl'], params['curv'])
        if res_wilcoxon_rep:
            # Record the p-value returned by test_kw_r for accepted features.
            wilcoxon_res[feat_name] = str(pv)
            if verbose:
                sys.stdout.write("wilc ok\t\n")
        else:
            wilcoxon_res[feat_name] = "-"
            if verbose:
                sys.stdout.write("wilc ko\n")
            del feats[feat_name]

    print("Number of significantly discriminative features: %s ( %s ) before internal wilcoxon"
          % (len(feats), kw_n_ok))

    if len(feats) > 0:
        if params['lda_abs_th'] < 0.0:
            # Negative threshold: skip ranking, score everything 0.0 and keep
            # all remaining features.
            lda_res, lda_res_th = dict.fromkeys(feats, 0.0), dict(feats)
        elif params['rank_tec'] == 'lda':
            lda_res, lda_res_th = test_lda_r(
                cls, feats, class_sl, params['n_boots'], params['f_boots'],
                params['lda_abs_th'], 0.0000000001, params['nlogs'])
        elif params['rank_tec'] == 'svm':
            lda_res, lda_res_th = test_svm(
                cls, feats, class_sl, params['n_boots'], params['f_boots'],
                params['lda_abs_th'], 0.0, params['svm_norm'])
        else:
            # Defensive fallback; read_params restricts rank_tec to lda/svm.
            lda_res, lda_res_th = dict.fromkeys(feats, 0.0), dict(feats)
    else:
        print("No features with significant differences between the two classes")
        lda_res, lda_res_th = {}, {}

    outres = {
        'lda_res_th': lda_res_th,
        'lda_res': lda_res,
        'cls_means': cls_means,
        'cls_means_kord': kord,
        'wilcox_res': wilcoxon_res,
    }
    print("Number of discriminative features with abs LDA score > %s : %s"
          % (params['lda_abs_th'], len(lda_res_th)))
    save_res(outres, params["output_file"])