comparison feature_selection.R @ 0:b4d2524e79ab draft

planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272
author anmoljh
date Fri, 01 Jun 2018 05:16:19 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b4d2524e79ab
# Positional command-line arguments (trailing arguments only):
#   arg1   .RData file to load; must provide `dataX` and `dataY`
#   arg2   output file for the saved rfe profile
#   arg3   output file for the reduced dataX / dataY
#   arg4   name of the caret rfe helper-function set
#   arg5   resampling method passed to rfeControl()
#   arg6   `repeats` for rfeControl()
#   arg7   `number` for rfeControl()
#   arg8   correlation cutoff for findCorrelation()
#   arg9   class-balancing mode ("downsampling", "upsampling", anything else
#          leaves the data as-is)
#   arg10  number of cores
args <- commandArgs(TRUE)

# Every argument is consumed further down; fail here with a clear message
# instead of with an NA comparison error in a later stage.
if (length(args) < 10) {
  stop("expected 10 command-line arguments, got ", length(args))
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
arg9 <- args[9]
arg10 <- args[10]
library(caret)
library(doMC)

# load() returns the names of the restored objects; the rest of the script
# needs both dataX and dataY to exist.
loadedObjects <- load(arg1)
if (!all(c("dataX", "dataY") %in% loadedObjects)) {
  stop("input file must contain objects named 'dataX' and 'dataY'")
}
16
17 #RAWDATA <- dataX
18 #RAWDATA$outcome <- dataY
19
20
21 ###########################
# Optional class balancing. "downsampling" / "upsampling" resample dataX and
# dataY with caret's downSample() / upSample(); any other value keeps the
# data untouched. Afterwards RAWDATA holds the predictors plus an `outcome`
# column, and dataX / dataY hold the (possibly resampled) predictors / labels.
Smpling <- arg9

if (Smpling == "downsampling" || Smpling == "upsampling") {
  resample <- if (Smpling == "downsampling") downSample else upSample
  balanced <- resample(dataX, dataY)

  # The sampled frame carries the class labels in its last column. Index it
  # explicitly: the previous `1:length(x)-1` parsed as `(1:n) - 1` and only
  # worked because a 0 index is silently ignored. `drop = FALSE` keeps a
  # single-predictor data set as a data frame.
  outcomeCol <- ncol(balanced)
  dataX <- balanced[, -outcomeCol, drop = FALSE]
  dataY <- balanced[[outcomeCol]]
  rm(balanced, resample, outcomeCol)
}

RAWDATA <- dataX
RAWDATA$outcome <- dataY
43
44
45
46
47 ##########################
48
49
# Missing-data filter:
#   1. every predictor column must be numeric;
#   2. predictor columns with more than 10% missing values are dropped;
#   3. rows with any remaining missing value (predictor or outcome) are dropped.
# Result: `rawData` = kept predictors followed by the `outcome` column.
rawData <- dataX
predictorNames <- names(rawData)

# Check each column on its own. apply() would first coerce the data frame to
# a matrix, so every column would report the type of the coerced matrix.
isNum <- vapply(rawData[, predictorNames, drop = FALSE], is.numeric, logical(1))
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Fraction of missing values per predictor column.
colRate <- colMeans(is.na(rawData[, predictorNames, drop = FALSE]))
colExclude <- colRate > 0.1
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
} else {
  rawData <- RAWDATA
}

# Drop incomplete rows. The same rule now applies in every case: previously
# rows with a missing outcome were removed only when no predictor row had a
# missing value.
rowExclude <- !complete.cases(rawData)
rawData <- rawData[!rowExclude, , drop = FALSE]
83
# Predictor clean-up before recursive feature elimination:
#   - remove near-zero-variance predictors from rawData,
#   - split into predictors (dx) and outcome (dy; last column of rawData),
#   - remove predictors correlated above the cutoff given in arg8,
#   - build the candidate subset sizes and normalise dx with preProcess().
set.seed(2)

# The outcome is the last column, so nearZeroVar() indices computed on the
# leading columns are also valid positions in rawData.
nzv <- nearZeroVar(rawData[, seq_len(ncol(rawData) - 1), drop = FALSE])
if (length(nzv) > 0) {
  rawData <- rawData[, -nzv, drop = FALSE]
}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# `drop = FALSE` keeps dx a data frame even when one predictor is left;
# otherwise length(dx) below would count rows instead of columns.
dx <- rawData[, seq_len(ncol(rawData) - 1), drop = FALSE]
dy <- rawData[[ncol(rawData)]]

if (ncol(dx) == 0) {
  stop("no predictors left after missing-value and near-zero-variance filtering")
}

corrThresh <- as.numeric(arg8)
# A correlation filter needs at least two predictors.
highCorr <- integer(0)
if (ncol(dx) > 1) {
  highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
}
# Guard the negative index: when nothing exceeds the cutoff highCorr is
# integer(0), and dx[, -integer(0)] would select zero columns.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

subsets <- seq(1, length(dx), by = 5)
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
108
# Choose the caret helper-function set named by arg4; any value other than
# "lmFuncs", "rfFuncs" or "treebagFuncs" falls back to nbFuncs.
if (arg4 == "lmFuncs") {
  rfeFunctions <- lmFuncs
} else if (arg4 == "rfFuncs") {
  rfeFunctions <- rfFuncs
} else if (arg4 == "treebagFuncs") {
  rfeFunctions <- treebagFuncs
} else {
  rfeFunctions <- nbFuncs
}

# Resampling settings for rfe(): method from arg5, repeats from arg6,
# number from arg7.
ctrl1 <- rfeControl(
  functions = rfeFunctions,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
135
136
137
# Run recursive feature elimination and write the results:
#   arg2 receives the rfe profile (`Profile`),
#   arg3 receives `dataX` (selected predictors, taken from the un-normalised
#        rawData) and `dataY` (the outcome).
# arg10 is the core count: 1 runs sequentially, more than 1 registers a doMC
# backend first, anything else is rejected.
nCores <- as.numeric(arg10)
if (is.na(nCores) || nCores < 1) {
  stop("something went wrong. please see the parameters")
}
if (nCores > 1) {
  registerDoMC(cores = nCores)
}

Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# `drop = FALSE`: keep dataX a data frame (with its column name) even when
# only one predictor is selected.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX)
rm(dataY)
163
164