diff FunctExeCalcGLMGalaxy.r @ 4:44b9069775ca draft

"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 3df1978827a91be30e815dee2ed83a92862d1b1c"
author ecology
date Sun, 22 Nov 2020 18:40:23 +0000
parents f0dc3958e65d
children 8db02073b671
line wrap: on
line diff
--- a/FunctExeCalcGLMGalaxy.r	Mon Jul 27 09:52:47 2020 -0400
+++ b/FunctExeCalcGLMGalaxy.r	Sun Nov 22 18:40:23 2020 +0000
@@ -1,4 +1,4 @@
-#Rscript 
+#Rscript
 
 #####################################################################################################################
 #####################################################################################################################
@@ -8,270 +8,253 @@
 
 ###################### Packages
 suppressMessages(library(multcomp))
+suppressMessages(library(DHARMa))
 suppressMessages(library(glmmTMB)) ###Version: 0.2.3
-suppressMessages(library(gap)) 
+suppressMessages(library(gap))
+
 
 ###################### Load arguments and declaring variables
 
-args = commandArgs(trailingOnly=TRUE)
-#options(encoding = "UTF-8")
+args <- commandArgs(trailingOnly = TRUE)
 
 if (length(args) < 10) {
-    stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call.=FALSE) #si pas d'arguments -> affiche erreur et quitte / if no args -> error and exit1
+    stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call. = FALSE) # if no args -> error and exit1
 
 } else {
-    Importdata <- args[1] ###### file name : metrics table
-    ImportUnitobs <- args[2] ###### file name : unitobs informations
+    import_data <- args[1] ###### file name : metrics table
+    import_unitobs <- args[2] ###### file name : unitobs informations
     colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM
-    listFact <- strsplit(args [4],",")[[1]] ###### Selected response factors for GLM
-    listRand <- strsplit(args [5],",")[[1]] ###### Selected randomized response factors for GLM
-    colFactAna <- args[6] ####### (optional) Selected splitting factors for GLMs
-    Distrib <- args[7] ###### (optional) Selected distribution for GLM 
-    log <- args[8] ###### (Optional) Log on interest metric ?
+    list_fact <- strsplit(args [4], ",")[[1]] ###### Selected response factors for GLM
+    list_rand <- strsplit(args [5], ",")[[1]] ###### Selected randomized response factors for GLM
+    col_fact_ana <- args[6] ####### (optional) Selected splitting factors for GLMs
+    distrib <- args[7] ###### (optional) Selected distribution for GLM
+    glm_out <- args[8] ###### (Optional) GLM object as Rdata output ?
     aggreg <- args[9] ###### Aggregation level of the data table
     source(args[10]) ###### Import functions
 
 }
-#### Data must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number")
+
+
+#### d_ata must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number")
 
 
-#Import des données / Import data 
-obs<- read.table(Importdata,sep="\t",dec=".",header=TRUE,encoding="UTF-8") #
-obs[obs == -999] <- NA 
+#Import des données / Import data
+obs <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
+obs[obs == -999] <- NA
 metric <- colnames(obs)[colmetric]
-tabUnitobs <- read.table(ImportUnitobs,sep="\t",dec=".",header=TRUE,encoding="UTF-8")
-tabUnitobs[tabUnitobs == -999] <- NA 
+tab_unitobs <- read.table(import_unitobs, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8")
+tab_unitobs[tab_unitobs == -999] <- NA
 
-if (colFactAna != "None")
-{
-    FactAna <- colnames(tabUnitobs)[as.numeric(colFactAna)]
-    if (class(tabUnitobs[FactAna]) == "numeric" || FactAna == "observation.unit"){stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")}
+if (col_fact_ana != "None") {
+    fact_ana <- colnames(tab_unitobs)[as.numeric(col_fact_ana)]
+    if (class(tab_unitobs[fact_ana]) == "numeric" || fact_ana == "observation.unit") {
+        stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")
+    }
 }else{
-    FactAna <- colFactAna
+    fact_ana <- col_fact_ana
+}
+
+vars_data1 <- NULL
+err_msg_data1 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- numeric or integer metric\n"
+check_file(obs, err_msg_data1, vars_data1, 2)
+
+vars_data2 <- c("observation.unit", list_fact, list_rand)
+err_msg_data2 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n"
+check_file(tab_unitobs, err_msg_data2, vars_data2[vars_data2 != "None"], 2)
+
+if (all(c(list_fact, list_rand) == "None")) {
+    stop("GLM needs to have at least one response variable.")
+}
+
+if (list_fact[1] == "None" || all(is.element(list_fact, list_rand))) {
+     stop("GLM can't have only random effects.")
 }
 
 
-#factors <- fact.det.f(Obs=obs)
-
-vars_data1<- NULL
-err_msg_data1<-"The input metrics dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- numeric or integer metric\n"
-check_file(obs,err_msg_data1,vars_data1,2)
-
-vars_data2 <- c(listFact,listRand)
-err_msg_data2<-"The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n"
-check_file(tabUnitobs,err_msg_data2,vars_data2[vars_data2 != "None"],2)
-
 ####################################################################################################
-########## Computing Generalized Linear Model ## Function : modeleLineaireWP2.unitobs.f ############
+######### Computing Generalized Linear Model ## Function : glm_community ############
 ####################################################################################################
 
-modeleLineaireWP2.unitobs.f <- function(metrique, listFact, listRand, FactAna, Distrib, log=FALSE, tabMetrics, tableMetrique, tabUnitobs, unitobs="observation.unit", nbName="number")
-{
-    ## Purpose: Monitoring steps for GLM on unitobs
+glm_community <- function(metrique, list_fact, list_rand, fact_ana, distrib, tab_metrics, tab_metrique, tab_unitobs, unitobs = "observation.unit", nb_name = "number") {
+    ## Purpose: Monitoring steps for GLM on community data
     ## ----------------------------------------------------------------------
     ## Arguments: metrique : selected metric
-    ##            listFact : Factors for GLM
-    ##            listRand : Random factors for GLM
-    ##            factAna : Separation factor for GLMs
-    ##            Distrib : selected distribution for model
-    ##            log : log transformation on data ? boolean
-    ##            tabMetrics : data table metrics
-    ##            tableMetrique : data table's name
-    ##            tabUnitobs : data table unitobs
+    ##            list_fact : Factors for GLM
+    ##            list_rand : Random factors for GLM
+    ##            fact_ana : Separation factor for GLMs
+    ##            distrib : selected distribution for model
+    ##            tab_metrics : data table metrics
+    ##            tab_metrique : data table's name
+    ##            tab_unitobs : data table unitobs
     ## ----------------------------------------------------------------------
     ## Author: Yves Reecht, Date: 18 août 2010, 15:59 modified by Coline ROYAUX 04 june 2020
 
-    tmpData <- tabMetrics
+    tmpd_ata <- tab_metrics
 
-    if (listRand[1] != "None")
-    {
-        if (all(is.element(listFact,listRand)) || listFact[1] == "None")
-        {
-            RespFact <- paste("(1|",paste(listRand,collapse=") + (1|"),")")
-            listF <- NULL
-            listFact <- listRand
-        }else{
-            listF <- listFact[!is.element(listFact,listRand)]
-            RespFact <- paste(paste(listF, collapse=" + ")," + (1|",paste(listRand,collapse=") + (1|"),")")
-            listFact <- c(listF,listRand)
-        }   
-    }else{
-        listF <- listFact
-        RespFact <- paste(listFact, collapse=" + ")
-    }
+    out_fact <- .GlobalEnv$organise_fact(list_rand = list_rand, list_fact = list_fact)
+    resp_fact <- out_fact[[1]]
+    list_f <- out_fact[[2]]
+    list_fact <- out_fact[[3]]
 
     ##Creating model's expression :
-
-    #if (log == FALSE) {
-        exprML <- eval(parse(text=paste(metrique, "~", RespFact)))
-    #}else{
-     #   exprML <- eval(parse(text=paste("log(",metrique,")", "~", RespFact)))
-    #}
+    expr_lm <- eval(parse(text = paste(metrique, "~", resp_fact)))
 
     ##Creating analysis table :
 
-    listFactTab <- c(listFact, FactAna)
-    listFactTab <- listFactTab[listFactTab != "None"]
+    list_fact_tab <- c(list_fact, fact_ana)
+    list_fact_tab <- list_fact_tab[list_fact_tab != "None"]
 
-    if (all(is.na(match(tmpData[,unitobs],tabUnitobs[,unitobs])))) {stop("Observation units doesn't match in the two input tables")}
+    if (all(is.na(match(tmpd_ata[, unitobs], tab_unitobs[, unitobs])))) {
+        stop("Observation units doesn't match in the two input tables")
+    }
 
-    if(! is.element("species.code",colnames(tmpData)))
-    {
-        col <- c(unitobs,metrique)
-        tmpData <- cbind(tmpData[,col], tabUnitobs[match(tmpData[,unitobs],tabUnitobs[,unitobs]),listFactTab])
-        colnames(tmpData) <- c(col,listFactTab)
+    if (! is.element("species.code", colnames(tmpd_ata))) {
+        col <- c(unitobs, metrique)
+        tmpd_ata <- cbind(tmpd_ata[, col], tab_unitobs[match(tmpd_ata[, unitobs], tab_unitobs[, unitobs]), list_fact_tab])
+        colnames(tmpd_ata) <- c(col, list_fact_tab)
 
-        for (i in listFactTab) {
+        for (i in list_fact_tab) {
             switch(i,
-                  tmpData[,i] <- as.factor(tmpData[,i]))
+                  tmpd_ata[, i] <- as.factor(tmpd_ata[, i]))
          }
     }else{
         stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site)")
     }
 
     ## Suppress unsed 'levels' :
-    tmpData <- dropLevels.f(tmpData)
+    tmpd_ata <- .GlobalEnv$drop_levels_f(tmpd_ata)
 
     ## Automatic choice of distribution if none is selected by user :
-    if (Distrib == "None") 
-    {
-        switch(class(tmpData[,metrique]),
-              "integer"={loiChoisie <- "poisson"},
-              "numeric"={loiChoisie <- "gaussian"},
-              stop("Selected metric class doesn't fit, you should select an integer or a numeric variable"))
+
+    chose_distrib <- .GlobalEnv$distrib_choice(distrib = distrib, metrique = metrique, data = tmpd_ata)
+
+    if (fact_ana != "None" && nlevels(tmpd_ata[, fact_ana]) > 1) {
+        ana_cut <- levels(tmpd_ata[, fact_ana])
     }else{
-        loiChoisie <- Distrib
-    }
-
-
-    ## Compute Model(s) :
-
-    if (FactAna != "None" && nlevels(tmpData[,FactAna]) > 1)
-    {
-        Anacut <- levels(tmpData[,FactAna])
-    }else{
-        Anacut <- NULL
+        ana_cut <- NULL
     }
 
-    ##Create results table : 
-    lev <- unlist(lapply(listF,FUN=function(x){levels(tmpData[,x])}))
-
-    if (listRand[1] != "None") ## if random effects
-    {
-        TabSum <- data.frame(analysis=c("global", Anacut),AIC=NA,BIC=NA,logLik=NA, deviance=NA,df.resid=NA)
-        colrand <- unlist(lapply(listRand, 
-                           FUN=function(x){lapply(c("Std.Dev","NbObservation","NbLevels"),
-                                                  FUN=function(y){paste(x,y,collapse = ":")
-                                                                 })
-                                          }))
-        TabSum[,colrand] <- NA
+    ##Create results table :
+    lev <- unlist(lapply(list_f, FUN = function(x) {
+                                                        levels(tmpd_ata[, x])
+                                                   }))
+    row <- c("global", ana_cut)
 
-        if (! is.null(lev)) ## if fixed effects + random effects
-        {
-            colcoef <- unlist(lapply(c("(Intercept)",lev),
-                               FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"),
-                                                      FUN=function(y){paste(x,y,collapse = ":")
-                                                                     })
-                                              }))
-        }else{ ## if no fixed effects
-            colcoef <- NULL
-        }
+    if (is.element("year", list_f) && ! is.element("year", list_rand)) {
+        tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = unlist(c("year", lev)), distrib = chose_distrib)
+    }else{
+        tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = lev, distrib = chose_distrib)
+    }
+    ### creating rate table
+    tab_rate <- data.frame(analysis = row, complete_plan = NA, balanced_plan = NA, NA_proportion_OK = NA, no_residual_dispersion = NA, uniform_residuals = NA, outliers_proportion_OK = NA, no_zero_inflation = NA, observation_factor_ratio_OK = NA, enough_levels_random_effect = NA, rate = NA)
 
-    }else{ ## if no random effects
-        TabSum <- data.frame(analysis=c("global", Anacut),AIC=NA,Resid.deviance=NA,df.resid=NA,Null.deviance=NA,df.null=NA)
+    ##plural analysis
+    for (cut in ana_cut) {
+        cutd_ata <- tmpd_ata[grep(cut, tmpd_ata[, fact_ana]), ]
+        cutd_ata <- .GlobalEnv$drop_levels_f(cutd_ata)
+
+        res <- ""
+        resy <- ""
 
-        switch(loiChoisie,
-               "gaussian"={colcoef <- unlist(lapply(c("(Intercept)",lev),
-                                             FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"),
-                                                                    FUN=function(y){paste(x,y,collapse = ":")
-                                                                                   })
-                                                            }))},
-               "quasipoisson"={colcoef <- unlist(lapply(c("(Intercept)",lev),
-                                             FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"),
-                                                                    FUN=function(y){paste(x,y,collapse = ":")
-                                                                                   })
-                                                            }))},
-               colcoef <- unlist(lapply(c("(Intercept)",lev),
-                                        FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"),
-                                                               FUN=function(y){paste(x,y,collapse = ":")
-                                                                              })
-                                                       })))
+        if (list_rand[1] != "None") {
+            res <- tryCatch(glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
+                                                                                                           })
+            if (is.element("year", list_f) && ! is.element("year", list_rand)) { #Model with year as continuous
+                cutd_ata$year <- as.numeric(cutd_ata$year)
+                resy <- tryCatch(glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
+                                                                                                                })
+                cutd_ata$year <- as.factor(cutd_ata$year)
+            }else{
+                resy <- ""
+            }
 
-    }  
-  
-    TabSum[,colcoef] <- NA
-
-    ### creating rate table 
-    TabRate <- data.frame(analysis=c("global", Anacut), complete_plan=NA, balanced_plan=NA, NA_proportion_OK=NA, no_residual_dispersion=NA, uniform_residuals=NA, outliers_proportion_OK=NA, no_zero_inflation=NA, observation_factor_ratio_OK=NA, enough_levels_random_effect=NA, rate=NA)
+        }else{
+            res <- tryCatch(glm(expr_lm, data = cutd_ata, family = chose_distrib), error = function(e) {
+                                                                                                       })
+            if (is.element("year", list_f)) { #Model with year as continuous
+                cutd_ata$year <- as.numeric(cutd_ata$year)
+                resy <- tryCatch(glm(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
+                                                                                                            })
+                cutd_ata$year <- as.factor(cutd_ata$year)
+            }else{
+                resy <- ""
+            }
 
-    for (cut in Anacut) 
-    {
-        cutData <- tmpData[grep(cut,tmpData[,FactAna]),]
-        cutData <- dropLevels.f(cutData)
-
-        res <-""
-
-        if (listRand[1] != "None")
-        {
-            res <- tryCatch(glmmTMB(exprML,family=loiChoisie, data=cutData), error=function(e){})
-        }else{
-            res <- tryCatch(glm(exprML,data=cutData,family=loiChoisie), error=function(e){})
         }
 
           ## Write results :
-         if (! is.null(res))
-         {
-            TabSum <- sortiesLM.f(objLM=res, TabSum=TabSum, metrique=metrique,
-                                  factAna=factAna, cut=cut, colAna="analysis", lev=lev, #modSel=iFactGraphSel, listFactSel=listFactSel,
-                                  listFact=listFact,
-                                  Data=cutData, #Log=Log,
-                                  type=ifelse(tableMetrique == "unitSpSz" && factAna != "size.class",
-                                              "CL_unitobs",
-                                              "unitobs"))
+         if (! is.null(res)) {
+            file_save_glm_cut <- paste("GLM_", cut, ".Rdata", sep = "")
+            save(res, file = file_save_glm_cut)
 
-            TabRate[TabRate[,"analysis"]==cut,c(2:11)] <- noteGLM.f(data=cutData, objLM=res, metric=metrique, listFact=listFact, details=TRUE)
+            tab_sum <- sorties_lm_f(obj_lm = res, obj_lmy = resy, tab_sum = tab_sum, metrique = metrique,
+                                  fact_ana = fact_ana, cut = cut, col_ana = "analysis", lev = lev, #modSel = iFactGraphSel, list_fact_sel = list_fact_sel,
+                                  list_fact = list_fact, list_rand = list_rand,
+                                  d_ata = cutd_ata)
+
+            tab_rate[tab_rate[, "analysis"] == cut, c(2:11)] <- .GlobalEnv$note_glm_f(data = cutd_ata, obj_lm = res, metric = metrique, list_fact = list_fact, details = TRUE)
 
         }else{
-            cat("\nCannot compute GLM for level",cut,"Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n")
+            cat("\nCannot compute GLM for level", cut, "Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n")
         }
 
     }
 
-    ## Global analysis : 
+    ## Global analysis :
+
+    res_g <- ""
+    res_gy <- ""
 
-    if (listRand[1] != "None")
-    {
-        resG <- glmmTMB(exprML,family=loiChoisie, data=tmpData)
+    if (list_rand[1] != "None") {
+        res_g <- glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = tmpd_ata)
+        if (is.element("year", list_fact) && ! is.element("year", list_rand)) { #Model with year as continuous
+            yr <- tmpd_ata$year
+            tmpd_ata$year <- as.numeric(tmpd_ata$year)
+            res_gy <- glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = tmpd_ata)
+            tmpd_ata$year <- as.factor(tmpd_ata$year)
+            tmpd_ata$year <- yr
+        }else{
+            res_gy <- ""
+        }
+
     }else{
-        resG <- glm(exprML,data=tmpData,family=loiChoisie)
+        res_g <- glm(expr_lm, data = tmpd_ata, family = chose_distrib)
+        if (is.element("year", list_fact)) { #Model with year as continuous
+            yr <- tmpd_ata$year
+            tmpd_ata$year <- as.numeric(tmpd_ata$year)
+            res_gy <- glm(expr_lm, family = chose_distrib, data = tmpd_ata)
+            tmpd_ata$year <- yr
+        }else{
+            res_gy <- ""
+        }
     }
 
     ## write results :
-    TabSum <- sortiesLM.f(objLM=resG, TabSum=TabSum, metrique=metrique,
-                          factAna=factAna, cut="global", colAna="analysis", lev=lev, #modSel=iFactGraphSel, listFactSel=listFactSel,
-                          listFact=listFact,
-                          Data=tmpData, #Log=Log,
-                          type=ifelse(tableMetrique == "unitSpSz" && factAna != "size.class",
-                                      "CL_unitobs",
-                                      "unitobs"))
+
+    save(res_g, file = "global_GLM.Rdata")
 
-    TabRate[TabRate[,"analysis"]=="global",c(2:11)] <- noteGLM.f(data=tmpData, objLM=resG, metric=metrique, listFact=listFact, details=TRUE)
-    noteGLMs.f(tabRate=TabRate,exprML=exprML,objLM=resG, file_out=TRUE)
+    tab_sum <- sorties_lm_f(obj_lm = res_g, obj_lmy = res_gy, tab_sum = tab_sum, metrique = metrique,
+                          fact_ana = fact_ana, cut = "global", col_ana = "analysis", lev = lev, #modSel = iFactGraphSel, list_fact_sel = list_fact_sel,
+                          list_fact = list_fact, list_rand = list_rand,
+                          d_ata = tmpd_ata)
+
+    tab_rate[tab_rate[, "analysis"] == "global", c(2:11)] <- .GlobalEnv$note_glm_f(data = tmpd_ata, obj_lm = res_g, metric = metrique, list_fact = list_fact, details = TRUE)
+    .GlobalEnv$note_glms_f(tab_rate = tab_rate, expr_lm = expr_lm, obj_lm = res_g, file_out = TRUE)
+
     ## simple statistics and infos :
     filename <- "GLMSummaryFull.txt"
+    info_stats_f(filename = filename, d_ata = tmpd_ata, agreg_level = aggreg, type = "stat",
+                metrique = metrique, fact_graph = fact_ana, #fact_graph_sel = modSel,
+                list_fact = list_fact)#, list_fact_sel = list_fact_sel)
 
-    ## Save data on model :
-        
-    infoStats.f(filename=filename, Data=tmpData, agregLevel=aggreg, type="stat",
-                metrique=metrique, factGraph=factAna, #factGraphSel=modSel,
-                listFact=listFact)#, listFactSel=listFactSel)
+    tab_sum$separation <- fact_ana
 
-    return(TabSum)
+    return(tab_sum)
 
 }
 
 ################# Analysis
 
-Tab <- modeleLineaireWP2.unitobs.f(metrique=metric, listFact=listFact, listRand=listRand, FactAna=FactAna, Distrib=Distrib, log=log, tabMetrics=obs, tableMetrique=aggreg, tabUnitobs=tabUnitobs, nbName="number")
+tab <- glm_community(metrique = metric, list_fact = list_fact, list_rand = list_rand, fact_ana = fact_ana, distrib = distrib, tab_metrics = obs, tab_metrique = aggreg, tab_unitobs = tab_unitobs, nb_name = "number")
 
-write.table(Tab,"GLMSummary.tabular", row.names=FALSE, sep="\t", dec=".",fileEncoding="UTF-8")
+write.table(tab, "GLMSummary.tabular", row.names = FALSE, sep = "\t", dec = ".", fileEncoding = "UTF-8")