Mercurial > repos > ecology > pampa_glmsp
comparison FunctExeCalcGLMSpGalaxy.r @ 2:6c14021f678e draft
"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 3df1978827a91be30e815dee2ed83a92862d1b1c"
author | ecology |
---|---|
date | Sun, 22 Nov 2020 18:40:40 +0000 |
parents | 0778efa9eb2e |
children | c12897ba5f83 |
comparison
equal
deleted
inserted
replaced
1:e972fe2bffee | 2:6c14021f678e |
---|---|
1 #Rscript | 1 #Rscript |
2 | 2 |
3 ##################################################################################################################### | 3 ##################################################################################################################### |
4 ##################################################################################################################### | 4 ##################################################################################################################### |
5 ################################# Compute a Generalized Linear Model from your data ################################# | 5 ################################# Compute a Generalized Linear Model from your data ################################# |
6 ##################################################################################################################### | 6 ##################################################################################################################### |
7 ##################################################################################################################### | 7 ##################################################################################################################### |
8 | 8 |
9 ###################### Packages | 9 ###################### Packages |
10 #suppressMessages(library(MASS)) | |
11 suppressMessages(library(multcomp)) | 10 suppressMessages(library(multcomp)) |
11 suppressMessages(library(DHARMa)) | |
12 suppressMessages(library(glmmTMB)) ###Version: 0.2.3 | 12 suppressMessages(library(glmmTMB)) ###Version: 0.2.3 |
13 suppressMessages(library(gap)) | 13 suppressMessages(library(gap)) |
14 | 14 |
15 | |
15 ###################### Load arguments and declaring variables | 16 ###################### Load arguments and declaring variables |
16 | 17 |
17 args = commandArgs(trailingOnly=TRUE) | 18 args <- commandArgs(trailingOnly = TRUE) |
18 #options(encoding = "UTF-8") | |
19 | 19 |
20 if (length(args) < 10) { | 20 if (length(args) < 10) { |
21 stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call.=FALSE) #si pas d'arguments -> affiche erreur et quitte / if no args -> error and exit1 | 21 stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call. = FALSE) # if no args -> error and exit1 |
22 | 22 |
23 } else { | 23 } else { |
24 Importdata <- args[1] ###### file name : metrics table | 24 import_data <- args[1] ###### file name : metrics table |
25 ImportUnitobs <- args[2] ###### file name : unitobs informations | 25 import_unitobs <- args[2] ###### file name : unitobs informations |
26 colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM | 26 colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM |
27 listFact <- strsplit(args [4],",")[[1]] ###### Selected response factors for GLM | 27 list_fact <- strsplit(args [4], ",")[[1]] ###### Selected response factors for GLM |
28 listRand <- strsplit(args [5],",")[[1]] ###### Selected randomized response factors for GLM | 28 list_rand <- strsplit(args [5], ",")[[1]] ###### Selected randomized response factors for GLM |
29 colFactAna <- args[6] ####### (optional) Selected splitting factors for GLMs | 29 col_fact_ana <- args[6] ####### (optional) Selected splitting factors for GLMs |
30 Distrib <- args[7] ###### (optional) Selected distribution for GLM | 30 distrib <- args[7] ###### (optional) Selected distribution for GLM |
31 log <- args[8] ###### (Optional) Log on interest metric ? | 31 glm_out <- args[8] ###### (Optional) GLM object as Rdata output ? |
32 aggreg <- args[9] ###### Aggregation level of the data table | 32 aggreg <- args[9] ###### Aggregation level of the data table |
33 source(args[10]) ###### Import functions | 33 source(args[10]) ###### Import functions |
34 | 34 |
35 } | 35 } |
36 #### Data must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number") | 36 #### d_ata must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number") |
37 | 37 |
38 | 38 |
39 #Import des données / Import data | 39 #Import des données / Import data |
40 obs<- read.table(Importdata,sep="\t",dec=".",header=TRUE,encoding="UTF-8") # | 40 obs <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") # |
41 obs[obs == -999] <- NA | 41 obs[obs == -999] <- NA |
42 metric <- colnames(obs)[colmetric] | 42 metric <- colnames(obs)[colmetric] |
43 tabUnitobs <- read.table(ImportUnitobs,sep="\t",dec=".",header=TRUE,encoding="UTF-8") | 43 tab_unitobs <- read.table(import_unitobs, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") |
44 tabUnitobs[tabUnitobs == -999] <- NA | 44 tab_unitobs[tab_unitobs == -999] <- NA |
45 | 45 |
46 vars_data1<- c("species.code") | 46 vars_data1 <- c("species.code") |
47 err_msg_data1<-"The input metrics dataset doesn't have the right format. It needs to have at least the following 3 variables :\n- species.code \n- observation.unit (or year and site)\n- numeric or integer metric\n" | 47 err_msg_data1 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 3 variables :\n- species.code \n- observation.unit (or year and site)\n- numeric or integer metric\n" |
48 check_file(obs,err_msg_data1,vars_data1,3) | 48 check_file(obs, err_msg_data1, vars_data1, 3) |
49 | 49 |
50 vars_data2 <- c(listFact,listRand) | 50 vars_data2 <- c("observation.unit", list_fact, list_rand) |
51 vars_data2 <- vars_data2[vars_data2 != "None"] | 51 err_msg_data2 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n" |
52 err_msg_data2<-"The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n" | 52 check_file(tab_unitobs, err_msg_data2, vars_data2[vars_data2 != "None"], 2) |
53 check_file(tabUnitobs,err_msg_data2,vars_data2,2) | 53 |
54 | 54 |
55 | 55 if (col_fact_ana != "None") { |
56 if (colFactAna != "None") | 56 fact_ana <- col_fact_ana |
57 { | 57 if (class(obs[fact_ana]) == "numeric" || fact_ana == "observation.unit") { |
58 FactAna <- colFactAna | 58 stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor") |
59 if (class(obs[FactAna]) == "numeric" || FactAna == "observation.unit"){stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")} | 59 } |
60 }else{ | 60 }else{ |
61 FactAna <- colFactAna | 61 fact_ana <- col_fact_ana |
62 } | 62 } |
63 | 63 |
64 | 64 if (all(c(list_fact, list_rand) == "None")) { |
65 #factors <- fact.det.f(Obs=obs) | 65 stop("GLM needs to have at least one response variable.") |
66 } | |
67 | |
68 if (list_fact[1] == "None" || all(is.element(list_fact, list_rand))) { | |
69 stop("GLM can't have only random effects.") | |
70 } | |
66 | 71 |
67 #################################################################################################### | 72 #################################################################################################### |
68 ########## Computing Generalized Linear Model ## Function : modeleLineaireWP2.unitobs.f ############ | 73 ########## Computing Generalized Linear Model ## Function : glm_species ############ |
69 #################################################################################################### | 74 #################################################################################################### |
70 | 75 |
71 modeleLineaireWP2.species.f <- function(metrique, listFact, listRand, FactAna, Distrib, log=FALSE, tabMetrics, tableMetrique, tabUnitobs, unitobs="observation.unit", outresiduals = FALSE, nbName="number") | 76 glm_species <- function(metrique, list_fact, list_rand, fact_ana, distrib, tab_metrics, tab_metrique, tab_unitobs, unitobs = "observation.unit", nb_name = "number") { |
72 { | 77 ## Purpose: Monitoring steps for GLM on species data |
73 ## Purpose: Gestions des différentes étapes des modèles linéaires. | |
74 ## ---------------------------------------------------------------------- | 78 ## ---------------------------------------------------------------------- |
75 ## Arguments: metrique : la métrique choisie. | 79 ## Arguments: metrique : selected metric |
76 ## factAna : le facteur de séparation des graphiques. | 80 ## list_fact : Factors for GLM |
77 ## factAnaSel : la sélection de modalités pour ce dernier | 81 ## list_rand : Random factors for GLM |
78 ## listFact : liste du (des) facteur(s) de regroupement | 82 ## fact_ana : Separation factor for GLMs |
79 ## listFactSel : liste des modalités sélectionnées pour ce(s) | 83 ## distrib : selected distribution for model |
80 ## dernier(s) | 84 ## tab_metrics : data table metrics |
81 ## tabMetrics : table de métriques. | 85 ## tab_metrique : data table's name |
82 ## tableMetrique : nom de la table de métriques. | 86 ## tab_unitobs : data table unitobs |
83 ## dataEnv : environnement de stockage des données. | |
84 ## baseEnv : environnement de l'interface. | |
85 ## ---------------------------------------------------------------------- | 87 ## ---------------------------------------------------------------------- |
86 ## Author: Yves Reecht, Date: 18 août 2010, 15:59 | 88 ## Author: Yves Reecht, Date: 18 août 2010, 15:59 modified by Coline ROYAUX 04 june 2020 |
87 | 89 |
88 tmpData <- tabMetrics | 90 tmpd_ata <- tab_metrics |
89 | 91 |
90 if (listRand[1] != "None") | 92 out_fact <- .GlobalEnv$organise_fact(list_rand = list_rand, list_fact = list_fact) |
91 { | 93 resp_fact <- out_fact[[1]] |
92 if (all(is.element(listFact,listRand)) || listFact[1] == "None") | 94 list_f <- out_fact[[2]] |
93 { | 95 list_fact <- out_fact[[3]] |
94 RespFact <- paste("(1|",paste(listRand,collapse=") + (1|"),")") | 96 |
95 listF <- NULL | |
96 listFact <- listRand | |
97 }else{ | |
98 listF <- listFact[!is.element(listFact,listRand)] | |
99 RespFact <- paste(paste(listF, collapse=" + ")," + (1|",paste(listRand,collapse=") + (1|"),")") | |
100 listFact <- c(listF,listRand) | |
101 } | |
102 }else{ | |
103 listF <- listFact | |
104 RespFact <- paste(listFact, collapse=" + ") | |
105 } | |
106 ##Creating model's expression : | 97 ##Creating model's expression : |
107 #if (log == FALSE) { | 98 expr_lm <- eval(parse(text = paste(metrique, "~", resp_fact))) |
108 exprML <- eval(parse(text=paste(metrique, "~", RespFact))) | |
109 #}else{ | |
110 # exprML <- eval(parse(text=paste("log(",metrique,")", "~", RespFact))) | |
111 #} | |
112 | 99 |
113 ##Creating analysis table : | 100 ##Creating analysis table : |
114 listFactTab <- c(listFact,FactAna) | 101 list_fact_tab <- c(list_fact, fact_ana) |
115 listFactTab <- listFactTab[listFactTab != "None"] | 102 list_fact_tab <- list_fact_tab[list_fact_tab != "None"] |
116 | 103 |
117 if (all(is.na(match(tmpData[,unitobs],tabUnitobs[,unitobs])))) {stop("Observation units doesn't match in the two input tables")} | 104 if (all(is.na(match(tmpd_ata[, unitobs], tab_unitobs[, unitobs])))) { |
118 | 105 stop("Observation units doesn't match in the two input tables") |
119 if(is.element("species.code",colnames(tmpData))) | 106 } |
120 { | 107 |
121 col <- c(unitobs,metrique,FactAna) | 108 if (is.element("species.code", colnames(tmpd_ata))) { |
122 tmpData <- cbind(tmpData[,col], tabUnitobs[match(tmpData[,unitobs],tabUnitobs[,unitobs]),listFact]) | 109 col <- c(unitobs, metrique, fact_ana) |
123 colnames(tmpData) <- c(col,listFact) | 110 tmpd_ata <- cbind(tmpd_ata[, col], tab_unitobs[match(tmpd_ata[, unitobs], tab_unitobs[, unitobs]), list_fact]) |
124 | 111 colnames(tmpd_ata) <- c(col, list_fact) |
125 for (i in listFactTab) { | 112 |
126 tmpData[,i] <- as.factor(tmpData[,i]) | 113 for (i in list_fact_tab) { |
114 tmpd_ata[, i] <- as.factor(tmpd_ata[, i]) | |
127 } | 115 } |
128 }else{ | 116 }else{ |
129 stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site) and species") | 117 stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site) and species") |
130 } | 118 } |
131 | 119 |
132 ## Suppression des 'levels' non utilisés : | 120 ## Suppression des 'levels' non utilisés : |
133 tmpData <- dropLevels.f(tmpData) | 121 tmpd_ata <- .GlobalEnv$drop_levels_f(tmpd_ata) |
134 | 122 |
135 ## Aide au choix du type d'analyse : | 123 ## Automatic choice of distribution if none is selected by user : |
136 if (Distrib == "None") | 124 |
137 { | 125 chose_distrib <- .GlobalEnv$distrib_choice(distrib = distrib, metrique = metrique, data = tmpd_ata) |
138 if (metrique == "pres.abs") | 126 |
139 { | 127 ##Create results table : |
140 loiChoisie <- "binomial" | 128 lev <- unlist(lapply(list_f, FUN = function(x) { |
129 levels(tmpd_ata[, x]) | |
130 })) | |
131 row <- levels(tmpd_ata[, fact_ana]) | |
132 | |
133 if (is.element("year", list_f) && ! is.element("year", list_rand)) { | |
134 tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = unlist(c("year", lev)), distrib = chose_distrib) | |
135 }else{ | |
136 tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = lev, distrib = chose_distrib) | |
137 } | |
138 ### creating rate table | |
139 tab_rate <- data.frame(species = row, complete_plan = NA, balanced_plan = NA, NA_proportion_OK = NA, no_residual_dispersion = NA, uniform_residuals = NA, outliers_proportion_OK = NA, no_zero_inflation = NA, observation_factor_ratio_OK = NA, enough_levels_random_effect = NA, rate = NA) | |
140 | |
141 ## Compute Model(s) : | |
142 | |
143 for (sp in levels(tmpd_ata[, fact_ana])) { | |
144 cutd_ata <- tmpd_ata[grep(sp, tmpd_ata[, fact_ana]), ] | |
145 cutd_ata <- .GlobalEnv$drop_levels_f(cutd_ata) | |
146 | |
147 res <- "" | |
148 resy <- "" | |
149 | |
150 if (list_rand[1] != "None") { | |
151 res <- tryCatch(glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { | |
152 }) | |
153 | |
154 if (is.element("year", list_f) && ! is.element("year", list_rand)) { #Model with year as continuous | |
155 cutd_ata$year <- as.numeric(cutd_ata$year) | |
156 resy <- tryCatch(glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { | |
157 }) | |
158 cutd_ata$year <- as.factor(cutd_ata$year) | |
159 }else{ | |
160 resy <- "" | |
161 } | |
141 }else{ | 162 }else{ |
142 switch(class(tmpData[,metrique]), | 163 res <- tryCatch(glm(expr_lm, data = cutd_ata, family = chose_distrib), error = function(e) { |
143 "integer"={loiChoisie <- "poisson"}, | 164 }) |
144 "numeric"={loiChoisie <- "gaussian"}, | 165 if (is.element("year", list_f)) { #Model with year as continuous |
145 stop("Selected metric class doesn't fit, you should select an integer or a numeric variable")) | 166 cutd_ata$year <- as.numeric(cutd_ata$year) |
167 resy <- tryCatch(glm(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { | |
168 }) | |
169 cutd_ata$year <- as.factor(cutd_ata$year) | |
170 }else{ | |
171 resy <- "" | |
172 } | |
146 } | 173 } |
147 }else{ | 174 |
148 loiChoisie <- Distrib | 175 ## Write results : |
149 } | 176 if (! is.null(res)) { |
150 | 177 file_save_glm_sp <- paste("GLM_", sp, ".Rdata", sep = "") |
151 ##Create results table : | 178 save(res, file = file_save_glm_sp) |
152 lev <- unlist(lapply(listF,FUN=function(x){levels(tmpData[,x])})) | 179 |
153 | 180 tab_sum <- .GlobalEnv$sorties_lm_f(obj_lm = res, obj_lmy = resy, tab_sum = tab_sum, fact_ana = fact_ana, cut = sp, col_ana = "analysis", lev = lev, d_ata = cutd_ata, metrique = metrique, list_fact = list_fact, list_rand = list_rand) |
154 if (listRand[1] != "None") ## if random effects | 181 |
155 { | 182 tab_rate[tab_rate[, "species"] == sp, c(2:11)] <- .GlobalEnv$note_glm_f(data = cutd_ata, obj_lm = res, metric = metrique, list_fact = list_fact, details = TRUE) |
156 TabSum <- data.frame(species=levels(tmpData[,FactAna]),AIC=NA,BIC=NA,logLik=NA, deviance=NA,df.resid=NA) | 183 |
157 colrand <- unlist(lapply(listRand, | 184 }else{ |
158 FUN=function(x){lapply(c("Std.Dev","NbObservation","NbLevels"), | 185 cat("\nCannot compute GLM for species", sp, "Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n") |
159 FUN=function(y){paste(x,y,collapse = ":") | |
160 }) | |
161 })) | |
162 TabSum[,colrand] <- NA | |
163 | |
164 if (! is.null(lev)) ## if fixed effects + random effects | |
165 { | |
166 colcoef <- unlist(lapply(c("(Intercept)",lev), | |
167 FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"), | |
168 FUN=function(y){paste(x,y,collapse = ":") | |
169 }) | |
170 })) | |
171 }else{ ## if no fixed effects | |
172 colcoef <- NULL | |
173 } | 186 } |
174 | 187 |
175 }else{ ## if no random effects | 188 } |
176 TabSum <- data.frame(species=levels(tmpData[,FactAna]),AIC=NA,Resid.deviance=NA,df.resid=NA,Null.deviance=NA,df.null=NA) | 189 .GlobalEnv$note_glms_f(tab_rate = tab_rate, expr_lm = expr_lm, obj_lm = res, file_out = TRUE) |
177 | |
178 switch(loiChoisie, | |
179 "gaussian"={colcoef <- unlist(lapply(c("(Intercept)",lev), | |
180 FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"), | |
181 FUN=function(y){paste(x,y,collapse = ":") | |
182 }) | |
183 }))}, | |
184 "quasipoisson"={colcoef <- unlist(lapply(c("(Intercept)",lev), | |
185 FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"), | |
186 FUN=function(y){paste(x,y,collapse = ":") | |
187 }) | |
188 }))}, | |
189 colcoef <- unlist(lapply(c("(Intercept)",lev), | |
190 FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"), | |
191 FUN=function(y){paste(x,y,collapse = ":") | |
192 }) | |
193 }))) | |
194 | |
195 } | |
196 | |
197 TabSum[,colcoef] <- NA | |
198 | |
199 ### creating rate table | |
200 TabRate <- data.frame(species=levels(tmpData[,FactAna]), complete_plan=NA, balanced_plan=NA, NA_proportion_OK=NA, no_residual_dispersion=NA, uniform_residuals=NA, outliers_proportion_OK=NA, no_zero_inflation=NA, observation_factor_ratio_OK=NA, enough_levels_random_effect=NA, rate=NA) | |
201 | |
202 ## Compute Model(s) : | |
203 | |
204 for (sp in levels(tmpData[,FactAna])) | |
205 { | |
206 cutData <- tmpData[grep(sp,tmpData[,FactAna]),] | |
207 cutData <- dropLevels.f(cutData) | |
208 | |
209 res <-"" | |
210 | |
211 if (listRand[1] != "None") | |
212 { | |
213 res <- tryCatch(glmmTMB(exprML,family=loiChoisie, data=cutData), error=function(e){}) | |
214 }else{ | |
215 res <- tryCatch(glm(exprML,data=cutData,family=loiChoisie), error=function(e){}) | |
216 } | |
217 | |
218 ## Écriture des résultats formatés dans un fichier : | |
219 if (! is.null(res)) | |
220 { | |
221 TabSum <- sortiesLM.f(objLM=res, TabSum=TabSum, factAna=factAna, cut=sp, colAna="species", lev=lev, Data=cutData, metrique=metrique, type="espece", listFact=listFact) | |
222 | |
223 TabRate[TabRate[,"species"]==sp,c(2:11)] <- noteGLM.f(data=cutData, objLM=res, metric=metrique, listFact=listFact, details=TRUE) | |
224 | |
225 }else{ | |
226 cat("\nCannot compute GLM for species",sp,"Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n") | |
227 } | |
228 | |
229 } | |
230 noteGLMs.f(tabRate=TabRate,exprML=exprML,objLM=res,file_out=TRUE) | |
231 | 190 |
232 ## simple statistics and infos : | 191 ## simple statistics and infos : |
233 filename <- "GLMSummaryFull.txt" | 192 filename <- "GLMSummaryFull.txt" |
234 | 193 |
235 ## Save data on model : | 194 ## Save data on model : |
236 | 195 |
237 infoStats.f(filename=filename, Data=tmpData, agregLevel=aggreg, type="stat", | 196 info_stats_f(filename = filename, d_ata = tmpd_ata, agreg_level = aggreg, type = "stat", |
238 metrique=metrique, factGraph=factAna, #factGraphSel=modSel, | 197 metrique = metrique, fact_graph = fact_ana, #fact_graph_sel = modSel, |
239 listFact=listFact)#, listFactSel=listFactSel) | 198 list_fact = list_fact)#, list_fact_sel = list_fact_sel) |
240 | 199 |
241 return(TabSum) | 200 return(tab_sum) |
242 } | 201 } |
243 | 202 |
244 ################# Analysis | 203 ################# Analysis |
245 | 204 |
246 Tab <- modeleLineaireWP2.species.f(metrique=metric, listFact=listFact, listRand=listRand, FactAna=FactAna, Distrib=Distrib, tabMetrics=obs, tableMetrique=aggreg, tabUnitobs=tabUnitobs, outresiduals=SupprOutlay, nbName="number") | 205 tab <- glm_species(metrique = metric, list_fact = list_fact, list_rand = list_rand, fact_ana = fact_ana, distrib = distrib, tab_metrics = obs, tab_metrique = aggreg, tab_unitobs = tab_unitobs, nb_name = "number") |
247 | 206 |
248 write.table(Tab,"GLMSummary.tabular", row.names=FALSE, sep="\t", dec=".",fileEncoding="UTF-8") | 207 write.table(tab, "GLMSummary.tabular", row.names = FALSE, sep = "\t", dec = ".", fileEncoding = "UTF-8") |
249 |