# HG changeset patch # User anmoljh # Date 1527844579 14400 # Node ID b4d2524e79ab9c4d6bbf4b1030bdc5de62d8cd61 planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272 diff -r 000000000000 -r b4d2524e79ab feature_selection.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/feature_selection.R Fri Jun 01 05:16:19 2018 -0400 @@ -0,0 +1,164 @@ +args <- commandArgs(T) + +arg1 <- args[1] +arg2 <- args[2] +arg3 <- args[3] +arg4 <- args[4] +arg5 <- args[5] +arg6 <- args[6] +arg7 <- args[7] +arg8 <- args[8] +arg9 <- args[9] +arg10 <- args[10] +library(caret) +library(doMC) +load(arg1) + +#RAWDATA <- dataX +#RAWDATA$outcome <- dataY + + +########################### +Smpling <- arg9 + +if(Smpling=="downsampling") +{ +dwnsmpl <- downSample(dataX,dataY) +RAWDATA <- dwnsmpl[,1:length(dwnsmpl)-1] +RAWDATA$outcome <- dwnsmpl[,length(dwnsmpl)] +dataX <- RAWDATA[,1:length(dwnsmpl)-1] +dataY <- RAWDATA[,"outcome"] +remove("dwnsmpl") +}else if(Smpling=="upsampling"){ +upsmpl <- upSample(dataX,dataY) +RAWDATA <- upsmpl[,1:length(upsmpl)-1] +RAWDATA$outcome <- upsmpl[,length(upsmpl)] +dataX <- RAWDATA[,1:length(upsmpl)-1] +dataY <- RAWDATA[,"outcome"] +remove("upsmpl") +}else { +RAWDATA <- dataX +RAWDATA$outcome <- dataY +} + + + + +########################## + + +rawData <- dataX +predictorNames <- names(rawData) + +isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) +if(any(!isNum)) stop("all predictors in rawData should be numeric") + +colRate <- apply(rawData[, predictorNames, drop = FALSE], + 2, function(x) mean(is.na(x))) +colExclude <- colRate > 0.1 + if(any(colExclude)){ + predictorNames <- predictorNames[-which(colExclude)] + rawData <- RAWDATA[, c(predictorNames,"outcome")] + } else { + rawData <- RAWDATA + } + rowRate <- apply(rawData[, predictorNames, drop = FALSE], + 1, function(x) mean(is.na(x))) + + +rowExclude <- rowRate > 0 + if(any(rowExclude)){ + rawData <- rawData[!rowExclude, ] + ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], + ##1, function(x) mean(is.na(x))) + +############################################################################ + + +############################################################################### + } else { + rawData <- rawData[complete.cases(rawData),] + + } + +set.seed(2) + +#print(dim(dataX)) +#print(dim(rawData)) +#print(length(dataY)) + +nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) + if(length(nzv) > 0) { + #nzvVars <- names(rawData)[nzv] + rawData <- rawData[,-nzv] + #rawData$outcome <- dataY + } + +predictorNames <- names(rawData)[names(rawData) != "outcome"] + +dx <- rawData[,1:length(rawData)-1] +dy <- rawData[,length(rawData)] +corrThresh <- as.numeric(arg8) +highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) +dx <- dx[, -highCorr] +subsets <- seq(1,length(dx),by=5) +normalization <- preProcess(dx) +dx <- predict(normalization, dx) +dx <- as.data.frame(dx) + +if (arg4 == "lmFuncs"){ +ctrl1 <- rfeControl(functions = lmFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} else if(arg4 == "rfFuncs"){ +ctrl1 <- rfeControl(functions = rfFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else if (arg4 == "treebagFuncs"){ +ctrl1 <- rfeControl(functions = treebagFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +}else { + +ctrl1 <- rfeControl(functions = nbFuncs, + method = arg5 , + repeats = as.numeric(arg6), + number = as.numeric(arg7), + verbose = FALSE) +} + + + +if (as.numeric(arg10) == 1){ +Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) + +pred11 <- predictors(Profile) +save(Profile,file=arg2) +dataX <- rawData[,pred11] +dataY <- rawData$outcome + +save(dataX,dataY,file=arg3) +rm(dataX) +rm(dataY) +} else if (as.numeric(arg10) > 1){ +registerDoMC(cores = as.numeric(arg10)) + +Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) + +pred11 <- predictors(Profile) +save(Profile,file=arg2) +dataX <- rawData[,pred11] +dataY <- rawData$outcome + +save(dataX,dataY,file=arg3) +rm(dataX) +rm(dataY) +} else { stop("something went wrong. please see the parameters")} + + diff -r 000000000000 -r b4d2524e79ab feature_selection.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/feature_selection.xml Fri Jun 01 05:16:19 2018 -0400 @@ -0,0 +1,96 @@ + + + This tool selects best features which are used as input for model building. + + + + R + carettools + + + + + + +feature_selection.R $input $profile $finalset $function1 $resampling $repeat $number $corcutoff $SAMPLING \${GALAXY_SLOTS:-1} >/dev/null 2>&1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**RFE based feature selection for classification and regression** + +Input file must be RData file obtained by converting csv file in to RData. + +output "Selected_feature.RData" file used for model building purpose.While profile + +represents feature selection model. + +Correlation cutoff value is desired for choosing independent variables For example + +Cutoff value = 0.8 removes all descriptors sharing equal or highet correlation values. + +User may choose varous resampling methods in combination with repeats and times of resample. + + + + + + diff -r 000000000000 -r b4d2524e79ab tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Fri Jun 01 05:16:19 2018 -0400 @@ -0,0 +1,9 @@ + + + + + + + + +