comparison FunctExeCalcGLMSpGalaxy.r @ 2:6c14021f678e draft

"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 3df1978827a91be30e815dee2ed83a92862d1b1c"
author ecology
date Sun, 22 Nov 2020 18:40:40 +0000
parents 0778efa9eb2e
children c12897ba5f83
comparison
equal deleted inserted replaced
1:e972fe2bffee 2:6c14021f678e
1 #Rscript 1 #Rscript
2 2
3 ##################################################################################################################### 3 #####################################################################################################################
4 ##################################################################################################################### 4 #####################################################################################################################
5 ################################# Compute a Generalized Linear Model from your data ################################# 5 ################################# Compute a Generalized Linear Model from your data #################################
6 ##################################################################################################################### 6 #####################################################################################################################
7 ##################################################################################################################### 7 #####################################################################################################################
8 8
9 ###################### Packages 9 ###################### Packages
10 #suppressMessages(library(MASS))
11 suppressMessages(library(multcomp)) 10 suppressMessages(library(multcomp))
11 suppressMessages(library(DHARMa))
12 suppressMessages(library(glmmTMB)) ###Version: 0.2.3 12 suppressMessages(library(glmmTMB)) ###Version: 0.2.3
13 suppressMessages(library(gap)) 13 suppressMessages(library(gap))
14 14
15
15 ###################### Load arguments and declaring variables 16 ###################### Load arguments and declaring variables
16 17
17 args = commandArgs(trailingOnly=TRUE) 18 args <- commandArgs(trailingOnly = TRUE)
18 #options(encoding = "UTF-8")
19 19
20 if (length(args) < 10) { 20 if (length(args) < 10) {
21 stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call.=FALSE) #si pas d'arguments -> affiche erreur et quitte / if no args -> error and exit1 21 stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call. = FALSE) # if no args -> error and exit1
22 22
23 } else { 23 } else {
24 Importdata <- args[1] ###### file name : metrics table 24 import_data <- args[1] ###### file name : metrics table
25 ImportUnitobs <- args[2] ###### file name : unitobs informations 25 import_unitobs <- args[2] ###### file name : unitobs informations
26 colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM 26 colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM
27 listFact <- strsplit(args [4],",")[[1]] ###### Selected response factors for GLM 27 list_fact <- strsplit(args [4], ",")[[1]] ###### Selected response factors for GLM
28 listRand <- strsplit(args [5],",")[[1]] ###### Selected randomized response factors for GLM 28 list_rand <- strsplit(args [5], ",")[[1]] ###### Selected randomized response factors for GLM
29 colFactAna <- args[6] ####### (optional) Selected splitting factors for GLMs 29 col_fact_ana <- args[6] ####### (optional) Selected splitting factors for GLMs
30 Distrib <- args[7] ###### (optional) Selected distribution for GLM 30 distrib <- args[7] ###### (optional) Selected distribution for GLM
31 log <- args[8] ###### (Optional) Log on interest metric ? 31 glm_out <- args[8] ###### (Optional) GLM object as Rdata output ?
32 aggreg <- args[9] ###### Aggregation level of the data table 32 aggreg <- args[9] ###### Aggregation level of the data table
33 source(args[10]) ###### Import functions 33 source(args[10]) ###### Import functions
34 34
35 } 35 }
36 #### Data must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number") 36 #### d_ata must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number")
37 37
38 38
39 #Import des données / Import data 39 #Import des données / Import data
40 obs<- read.table(Importdata,sep="\t",dec=".",header=TRUE,encoding="UTF-8") # 40 obs <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
41 obs[obs == -999] <- NA 41 obs[obs == -999] <- NA
42 metric <- colnames(obs)[colmetric] 42 metric <- colnames(obs)[colmetric]
43 tabUnitobs <- read.table(ImportUnitobs,sep="\t",dec=".",header=TRUE,encoding="UTF-8") 43 tab_unitobs <- read.table(import_unitobs, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8")
44 tabUnitobs[tabUnitobs == -999] <- NA 44 tab_unitobs[tab_unitobs == -999] <- NA
45 45
46 vars_data1<- c("species.code") 46 vars_data1 <- c("species.code")
47 err_msg_data1<-"The input metrics dataset doesn't have the right format. It needs to have at least the following 3 variables :\n- species.code \n- observation.unit (or year and site)\n- numeric or integer metric\n" 47 err_msg_data1 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 3 variables :\n- species.code \n- observation.unit (or year and site)\n- numeric or integer metric\n"
48 check_file(obs,err_msg_data1,vars_data1,3) 48 check_file(obs, err_msg_data1, vars_data1, 3)
49 49
50 vars_data2 <- c(listFact,listRand) 50 vars_data2 <- c("observation.unit", list_fact, list_rand)
51 vars_data2 <- vars_data2[vars_data2 != "None"] 51 err_msg_data2 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n"
52 err_msg_data2<-"The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n" 52 check_file(tab_unitobs, err_msg_data2, vars_data2[vars_data2 != "None"], 2)
53 check_file(tabUnitobs,err_msg_data2,vars_data2,2) 53
54 54
55 55 if (col_fact_ana != "None") {
56 if (colFactAna != "None") 56 fact_ana <- col_fact_ana
57 { 57 if (class(obs[fact_ana]) == "numeric" || fact_ana == "observation.unit") {
58 FactAna <- colFactAna 58 stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")
59 if (class(obs[FactAna]) == "numeric" || FactAna == "observation.unit"){stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")} 59 }
60 }else{ 60 }else{
61 FactAna <- colFactAna 61 fact_ana <- col_fact_ana
62 } 62 }
63 63
64 64 if (all(c(list_fact, list_rand) == "None")) {
65 #factors <- fact.det.f(Obs=obs) 65 stop("GLM needs to have at least one response variable.")
66 }
67
68 if (list_fact[1] == "None" || all(is.element(list_fact, list_rand))) {
69 stop("GLM can't have only random effects.")
70 }
66 71
67 #################################################################################################### 72 ####################################################################################################
68 ########## Computing Generalized Linear Model ## Function : modeleLineaireWP2.unitobs.f ############ 73 ########## Computing Generalized Linear Model ## Function : glm_species ############
69 #################################################################################################### 74 ####################################################################################################
70 75
71 modeleLineaireWP2.species.f <- function(metrique, listFact, listRand, FactAna, Distrib, log=FALSE, tabMetrics, tableMetrique, tabUnitobs, unitobs="observation.unit", outresiduals = FALSE, nbName="number") 76 glm_species <- function(metrique, list_fact, list_rand, fact_ana, distrib, tab_metrics, tab_metrique, tab_unitobs, unitobs = "observation.unit", nb_name = "number") {
72 { 77 ## Purpose: Monitoring steps for GLM on species data
73 ## Purpose: Gestions des différentes étapes des modèles linéaires.
74 ## ---------------------------------------------------------------------- 78 ## ----------------------------------------------------------------------
75 ## Arguments: metrique : la métrique choisie. 79 ## Arguments: metrique : selected metric
76 ## factAna : le facteur de séparation des graphiques. 80 ## list_fact : Factors for GLM
77 ## factAnaSel : la sélection de modalités pour ce dernier 81 ## list_rand : Random factors for GLM
78 ## listFact : liste du (des) facteur(s) de regroupement 82 ## fact_ana : Separation factor for GLMs
79 ## listFactSel : liste des modalités sélectionnées pour ce(s) 83 ## distrib : selected distribution for model
80 ## dernier(s) 84 ## tab_metrics : data table metrics
81 ## tabMetrics : table de métriques. 85 ## tab_metrique : data table's name
82 ## tableMetrique : nom de la table de métriques. 86 ## tab_unitobs : data table unitobs
83 ## dataEnv : environnement de stockage des données.
84 ## baseEnv : environnement de l'interface.
85 ## ---------------------------------------------------------------------- 87 ## ----------------------------------------------------------------------
86 ## Author: Yves Reecht, Date: 18 août 2010, 15:59 88 ## Author: Yves Reecht, Date: 18 août 2010, 15:59 modified by Coline ROYAUX 04 june 2020
87 89
88 tmpData <- tabMetrics 90 tmpd_ata <- tab_metrics
89 91
90 if (listRand[1] != "None") 92 out_fact <- .GlobalEnv$organise_fact(list_rand = list_rand, list_fact = list_fact)
91 { 93 resp_fact <- out_fact[[1]]
92 if (all(is.element(listFact,listRand)) || listFact[1] == "None") 94 list_f <- out_fact[[2]]
93 { 95 list_fact <- out_fact[[3]]
94 RespFact <- paste("(1|",paste(listRand,collapse=") + (1|"),")") 96
95 listF <- NULL
96 listFact <- listRand
97 }else{
98 listF <- listFact[!is.element(listFact,listRand)]
99 RespFact <- paste(paste(listF, collapse=" + ")," + (1|",paste(listRand,collapse=") + (1|"),")")
100 listFact <- c(listF,listRand)
101 }
102 }else{
103 listF <- listFact
104 RespFact <- paste(listFact, collapse=" + ")
105 }
106 ##Creating model's expression : 97 ##Creating model's expression :
107 #if (log == FALSE) { 98 expr_lm <- eval(parse(text = paste(metrique, "~", resp_fact)))
108 exprML <- eval(parse(text=paste(metrique, "~", RespFact)))
109 #}else{
110 # exprML <- eval(parse(text=paste("log(",metrique,")", "~", RespFact)))
111 #}
112 99
113 ##Creating analysis table : 100 ##Creating analysis table :
114 listFactTab <- c(listFact,FactAna) 101 list_fact_tab <- c(list_fact, fact_ana)
115 listFactTab <- listFactTab[listFactTab != "None"] 102 list_fact_tab <- list_fact_tab[list_fact_tab != "None"]
116 103
117 if (all(is.na(match(tmpData[,unitobs],tabUnitobs[,unitobs])))) {stop("Observation units doesn't match in the two input tables")} 104 if (all(is.na(match(tmpd_ata[, unitobs], tab_unitobs[, unitobs])))) {
118 105 stop("Observation units doesn't match in the two input tables")
119 if(is.element("species.code",colnames(tmpData))) 106 }
120 { 107
121 col <- c(unitobs,metrique,FactAna) 108 if (is.element("species.code", colnames(tmpd_ata))) {
122 tmpData <- cbind(tmpData[,col], tabUnitobs[match(tmpData[,unitobs],tabUnitobs[,unitobs]),listFact]) 109 col <- c(unitobs, metrique, fact_ana)
123 colnames(tmpData) <- c(col,listFact) 110 tmpd_ata <- cbind(tmpd_ata[, col], tab_unitobs[match(tmpd_ata[, unitobs], tab_unitobs[, unitobs]), list_fact])
124 111 colnames(tmpd_ata) <- c(col, list_fact)
125 for (i in listFactTab) { 112
126 tmpData[,i] <- as.factor(tmpData[,i]) 113 for (i in list_fact_tab) {
114 tmpd_ata[, i] <- as.factor(tmpd_ata[, i])
127 } 115 }
128 }else{ 116 }else{
129 stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site) and species") 117 stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site) and species")
130 } 118 }
131 119
132 ## Suppression des 'levels' non utilisés : 120 ## Suppression des 'levels' non utilisés :
133 tmpData <- dropLevels.f(tmpData) 121 tmpd_ata <- .GlobalEnv$drop_levels_f(tmpd_ata)
134 122
135 ## Aide au choix du type d'analyse : 123 ## Automatic choice of distribution if none is selected by user :
136 if (Distrib == "None") 124
137 { 125 chose_distrib <- .GlobalEnv$distrib_choice(distrib = distrib, metrique = metrique, data = tmpd_ata)
138 if (metrique == "pres.abs") 126
139 { 127 ##Create results table :
140 loiChoisie <- "binomial" 128 lev <- unlist(lapply(list_f, FUN = function(x) {
129 levels(tmpd_ata[, x])
130 }))
131 row <- levels(tmpd_ata[, fact_ana])
132
133 if (is.element("year", list_f) && ! is.element("year", list_rand)) {
134 tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = unlist(c("year", lev)), distrib = chose_distrib)
135 }else{
136 tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = lev, distrib = chose_distrib)
137 }
138 ### creating rate table
139 tab_rate <- data.frame(species = row, complete_plan = NA, balanced_plan = NA, NA_proportion_OK = NA, no_residual_dispersion = NA, uniform_residuals = NA, outliers_proportion_OK = NA, no_zero_inflation = NA, observation_factor_ratio_OK = NA, enough_levels_random_effect = NA, rate = NA)
140
141 ## Compute Model(s) :
142
143 for (sp in levels(tmpd_ata[, fact_ana])) {
144 cutd_ata <- tmpd_ata[grep(sp, tmpd_ata[, fact_ana]), ]
145 cutd_ata <- .GlobalEnv$drop_levels_f(cutd_ata)
146
147 res <- ""
148 resy <- ""
149
150 if (list_rand[1] != "None") {
151 res <- tryCatch(glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
152 })
153
154 if (is.element("year", list_f) && ! is.element("year", list_rand)) { #Model with year as continuous
155 cutd_ata$year <- as.numeric(cutd_ata$year)
156 resy <- tryCatch(glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
157 })
158 cutd_ata$year <- as.factor(cutd_ata$year)
159 }else{
160 resy <- ""
161 }
141 }else{ 162 }else{
142 switch(class(tmpData[,metrique]), 163 res <- tryCatch(glm(expr_lm, data = cutd_ata, family = chose_distrib), error = function(e) {
143 "integer"={loiChoisie <- "poisson"}, 164 })
144 "numeric"={loiChoisie <- "gaussian"}, 165 if (is.element("year", list_f)) { #Model with year as continuous
145 stop("Selected metric class doesn't fit, you should select an integer or a numeric variable")) 166 cutd_ata$year <- as.numeric(cutd_ata$year)
167 resy <- tryCatch(glm(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) {
168 })
169 cutd_ata$year <- as.factor(cutd_ata$year)
170 }else{
171 resy <- ""
172 }
146 } 173 }
147 }else{ 174
148 loiChoisie <- Distrib 175 ## Write results :
149 } 176 if (! is.null(res)) {
150 177 file_save_glm_sp <- paste("GLM_", sp, ".Rdata", sep = "")
151 ##Create results table : 178 save(res, file = file_save_glm_sp)
152 lev <- unlist(lapply(listF,FUN=function(x){levels(tmpData[,x])})) 179
153 180 tab_sum <- .GlobalEnv$sorties_lm_f(obj_lm = res, obj_lmy = resy, tab_sum = tab_sum, fact_ana = fact_ana, cut = sp, col_ana = "analysis", lev = lev, d_ata = cutd_ata, metrique = metrique, list_fact = list_fact, list_rand = list_rand)
154 if (listRand[1] != "None") ## if random effects 181
155 { 182 tab_rate[tab_rate[, "species"] == sp, c(2:11)] <- .GlobalEnv$note_glm_f(data = cutd_ata, obj_lm = res, metric = metrique, list_fact = list_fact, details = TRUE)
156 TabSum <- data.frame(species=levels(tmpData[,FactAna]),AIC=NA,BIC=NA,logLik=NA, deviance=NA,df.resid=NA) 183
157 colrand <- unlist(lapply(listRand, 184 }else{
158 FUN=function(x){lapply(c("Std.Dev","NbObservation","NbLevels"), 185 cat("\nCannot compute GLM for species", sp, "Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n")
159 FUN=function(y){paste(x,y,collapse = ":")
160 })
161 }))
162 TabSum[,colrand] <- NA
163
164 if (! is.null(lev)) ## if fixed effects + random effects
165 {
166 colcoef <- unlist(lapply(c("(Intercept)",lev),
167 FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"),
168 FUN=function(y){paste(x,y,collapse = ":")
169 })
170 }))
171 }else{ ## if no fixed effects
172 colcoef <- NULL
173 } 186 }
174 187
175 }else{ ## if no random effects 188 }
176 TabSum <- data.frame(species=levels(tmpData[,FactAna]),AIC=NA,Resid.deviance=NA,df.resid=NA,Null.deviance=NA,df.null=NA) 189 .GlobalEnv$note_glms_f(tab_rate = tab_rate, expr_lm = expr_lm, obj_lm = res, file_out = TRUE)
177
178 switch(loiChoisie,
179 "gaussian"={colcoef <- unlist(lapply(c("(Intercept)",lev),
180 FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"),
181 FUN=function(y){paste(x,y,collapse = ":")
182 })
183 }))},
184 "quasipoisson"={colcoef <- unlist(lapply(c("(Intercept)",lev),
185 FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"),
186 FUN=function(y){paste(x,y,collapse = ":")
187 })
188 }))},
189 colcoef <- unlist(lapply(c("(Intercept)",lev),
190 FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"),
191 FUN=function(y){paste(x,y,collapse = ":")
192 })
193 })))
194
195 }
196
197 TabSum[,colcoef] <- NA
198
199 ### creating rate table
200 TabRate <- data.frame(species=levels(tmpData[,FactAna]), complete_plan=NA, balanced_plan=NA, NA_proportion_OK=NA, no_residual_dispersion=NA, uniform_residuals=NA, outliers_proportion_OK=NA, no_zero_inflation=NA, observation_factor_ratio_OK=NA, enough_levels_random_effect=NA, rate=NA)
201
202 ## Compute Model(s) :
203
204 for (sp in levels(tmpData[,FactAna]))
205 {
206 cutData <- tmpData[grep(sp,tmpData[,FactAna]),]
207 cutData <- dropLevels.f(cutData)
208
209 res <-""
210
211 if (listRand[1] != "None")
212 {
213 res <- tryCatch(glmmTMB(exprML,family=loiChoisie, data=cutData), error=function(e){})
214 }else{
215 res <- tryCatch(glm(exprML,data=cutData,family=loiChoisie), error=function(e){})
216 }
217
218 ## Écriture des résultats formatés dans un fichier :
219 if (! is.null(res))
220 {
221 TabSum <- sortiesLM.f(objLM=res, TabSum=TabSum, factAna=factAna, cut=sp, colAna="species", lev=lev, Data=cutData, metrique=metrique, type="espece", listFact=listFact)
222
223 TabRate[TabRate[,"species"]==sp,c(2:11)] <- noteGLM.f(data=cutData, objLM=res, metric=metrique, listFact=listFact, details=TRUE)
224
225 }else{
226 cat("\nCannot compute GLM for species",sp,"Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n")
227 }
228
229 }
230 noteGLMs.f(tabRate=TabRate,exprML=exprML,objLM=res,file_out=TRUE)
231 190
232 ## simple statistics and infos : 191 ## simple statistics and infos :
233 filename <- "GLMSummaryFull.txt" 192 filename <- "GLMSummaryFull.txt"
234 193
235 ## Save data on model : 194 ## Save data on model :
236 195
237 infoStats.f(filename=filename, Data=tmpData, agregLevel=aggreg, type="stat", 196 info_stats_f(filename = filename, d_ata = tmpd_ata, agreg_level = aggreg, type = "stat",
238 metrique=metrique, factGraph=factAna, #factGraphSel=modSel, 197 metrique = metrique, fact_graph = fact_ana, #fact_graph_sel = modSel,
239 listFact=listFact)#, listFactSel=listFactSel) 198 list_fact = list_fact)#, list_fact_sel = list_fact_sel)
240 199
241 return(TabSum) 200 return(tab_sum)
242 } 201 }
243 202
244 ################# Analysis 203 ################# Analysis
245 204
246 Tab <- modeleLineaireWP2.species.f(metrique=metric, listFact=listFact, listRand=listRand, FactAna=FactAna, Distrib=Distrib, tabMetrics=obs, tableMetrique=aggreg, tabUnitobs=tabUnitobs, outresiduals=SupprOutlay, nbName="number") 205 tab <- glm_species(metrique = metric, list_fact = list_fact, list_rand = list_rand, fact_ana = fact_ana, distrib = distrib, tab_metrics = obs, tab_metrique = aggreg, tab_unitobs = tab_unitobs, nb_name = "number")
247 206
248 write.table(Tab,"GLMSummary.tabular", row.names=FALSE, sep="\t", dec=".",fileEncoding="UTF-8") 207 write.table(tab, "GLMSummary.tabular", row.names = FALSE, sep = "\t", dec = ".", fileEncoding = "UTF-8")
249