diff FunctExePlotGLMGalaxy.r @ 0:3ab852a7ff53 draft

"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 04381ca7162ec3ec68419e308194b91d11cacb04"
author ecology
date Mon, 16 Nov 2020 11:02:43 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FunctExePlotGLMGalaxy.r	Mon Nov 16 11:02:43 2020 +0000
@@ -0,0 +1,403 @@
+#Rscript
+
+#####################################################################################################################
+#####################################################################################################################
+###################################### Create a plot from your community data #######################################
+#####################################################################################################################
+#####################################################################################################################
+
+###################### Packages
+suppressMessages(library(ggplot2))
+suppressMessages(library(boot))
+
+###################### Load arguments and declaring variables
+
+args <- commandArgs(trailingOnly = TRUE)
+
+
+if (length(args) < 2) {
+    stop("At least 3 arguments must be supplied input dataset file with GLM results (.tabular)", call. = FALSE) #if no args -> error and exit1
+
+} else {
+    import_data <- args[1] ###### file name : glm results table
+    data_tab <- args[2] ###### file name : Metrics table
+    unitobs_tab <- args[3] ###### file name : Unitobs table
+    source(args[4]) ###### Import functions
+
+}
+
+#Import data
+glmtable <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
+datatable <- read.table(data_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
+unitobs <- read.table(unitobs_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
+
+#Check files
+
+vars_data1 <- c("analysis", "Interest.var", "distribution")
+err_msg_data1 <- "The input GLM results dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- analysis\n- Interest.var\n- distribution\n"
+check_file(glmtable, err_msg_data1, vars_data1, 4)
+
+if (length(grep("[0-2][0|9][0-9][0-9].[Estimate|Pvalue]", colnames(glmtable))) == 0) {
+    stop("The input GLM results dataset doesn't have the right format or informations. This tool is to represent temporal trends, if your GLM doesn't take the year variable as a fixed effect this tool is not proper to make any representation of it. It needs to have at least estimates and p-value for every year from your time series GLM as columns with name formated as : yyyy Estimate (example : 2020 Estimate) and  yyyy Pvalue (example : 2020 Pvalue).")
+}
+
+if (length(grep("[0-2][0|9][0-9][0-9].IC_[up|inf]", colnames(glmtable))) == 0) {
+    assess_ic <- FALSE
+}else{
+    assess_ic <- TRUE
+}
+
+metric <- as.character(glmtable[1, "Interest.var"])
+
+vars_data2 <- c("observation.unit", "location", metric)
+err_msg_data2 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- observation.unit\n- location\n- the name of the interest metric\n"
+check_file(datatable, err_msg_data2, vars_data2, 4)
+
+vars_data3 <- c("observation.unit", "year")
+err_msg_data3 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 fields :\n- observation.unit\n- year\n"
+check_file(unitobs, err_msg_data3, vars_data3, 2)
+if (length(grep("[0-2][0|9][0-9][0-9]", unitobs$year)) == 0) {
+    stop("The year column in the input unitobs dataset doesn't have the right format. Years must be fully written as : yyyy (example : 2020).")
+}
+
+if (all(is.na(match(datatable[, "observation.unit"], unitobs[, "observation.unit"])))) {
+    stop("Observation units doesn't match in the inputs metrics dataset and unitobs dataset")
+}
+
+####################################################################################################################
+######################### Creating plot from time series GLM data ## Function : ggplot_glm #########################
+####################################################################################################################
+ggplot_glm <- function(glmtable, datatable, unitobs, metric = metric, sp, description = TRUE,
+                       trend_on_graph = TRUE, assess_ic = TRUE) {
+    ## Purpose: Creating plot from time series GLM data
+    ## ----------------------------------------------------------------------
+    ## Arguments: glmtable : GLM(s) results table
+    ##            datatable : Metrics table
+    ##            unitobs : Unitobs table
+    ##            metric : Interest variable in GLM(s)
+    ##            sp : name of processed GLM
+    ##            description : Two graphs ?
+    ##            trend_on_graph : Write global trend of the time series on graph ?
+    ##            assess_ic : Assess confidence intervals ?
+    ## ----------------------------------------------------------------------
+    ## Author: Coline ROYAUX 13 october 2020
+
+    s_signif <- 0.05 ## threshold when pvalue is considered significant
+    distrib <- as.character(glmtable[1, "distribution"]) ## extract GLM distribution
+
+    col <- c("observation.unit", "location", metric) ## names of needed columns in metrics table to construct the 2nd panel of the graph
+
+    if (colnames(glmtable)[length(glmtable)] == "separation") { ## if GLM is a community analysis
+        cut <- as.character(glmtable[1, "separation"])
+        if (cut != "None") { ## if there is plural GLM analysis performed
+            if (! is.element(as.character(cut), colnames(unitobs))) {
+                stop("The input unitobs dataset doesn't have the right format. If plural GLM analysis have been performed it needs to have the field used as separation factor.")
+            }
+            datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), c("year", cut)]) ## extracting 'year' and analysis separation factor columns from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
+            colnames(datatable) <- c(col, "year", cut)
+        }else{
+            datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
+            colnames(datatable) <- c(col, "year")
+        }
+
+    }else{ ## GLM is a population analysis
+        cut <- "species.code"
+        if (! is.element("species.code", colnames(datatable))) {
+            stop("The input metrics dataset or the GLM results dataset doesn't have the right format. If the GLM is a population analysis, the field species.code needs to be informed in the metrics dataset. If the GLM is a community analysis the field separation (None if only one analysis and name of the separation factor if several analysis) needs to be informed in the last column of the GLM results dataset.")
+        }
+        col <- c(col, cut)
+        datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
+        colnames(datatable) <- c(col, "year")
+    }
+
+    ##vpan vector of names of the two panels in the ggplot
+
+    switch(as.character(metric),
+           "number" = vpan <- c("Abundance variation", "Raw abundance"),
+           "presence_absence" = vpan <- c("Presence-absence variation", "% presence in location"),
+           vpan <- c(paste(metric, " variation"), paste("Mean ", metric)))
+
+    ##Cut table for 1 analysis
+    glmtab <- glmtable[glmtable[, "analysis"] == sp, ]
+
+    glmtab <- glmtab[, grep("FALSE", is.na(glmtab[1, ]))] ## Supress columns with NA only
+
+    ## specification of temporal variable necessary for the analyses
+    an <- as.numeric(unlist(strsplit(gsub("X", "", paste(colnames(glmtab)[grep("[0-2][0|9][0-9][0-9].Estimate", colnames(glmtab))], collapse = " ")), split = ".Estimate"))) ## Extract list of years studied in the GLM
+
+    year <- sort(c(min(an) - 1, an)) ## Add the first level, "reference" in the GLM
+    nbans <- length(year)
+    pasdetemps <- nbans - 1
+
+    ##### Table 1
+
+    coefan <- unlist(lapply(an, FUN = function(x) {
+                                          if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]) > 0) {
+                                              glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]
+                                          }else{
+                                              NA
+                                          }
+                                                 }))  ## extract estimates for each years to contruct graph 1
+
+    ic_inf <- unlist(lapply(an, FUN = function(x) {
+                                          if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]) > 0) {
+                                              glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]
+                                          }else{
+                                               NA
+                                          }
+                                                  }))
+
+    ic_up <- unlist(lapply(an, FUN = function(x) {
+                                         if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]) > 0) {
+                                             glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]
+                                         }else{
+                                             NA
+                                         }
+                                                }))
+
+    switch(distrib,  ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
+           "poisson" = {
+                            coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
+                            if (assess_ic) {
+                                ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
+                                ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
+                            } else {
+                                ic_inf_sim <- NA
+                                ic_sup_sim <- NA
+                      }},
+           "quasipoisson" = {
+                                 coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
+                                 if (assess_ic) {
+                                     ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
+                                     ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
+                                 } else {
+                                     ic_inf_sim <- NA
+                                     ic_sup_sim <- NA
+                           }},
+           "inverse.gaussian" = {
+                                     coefyear <- c(0, as.numeric(coefan) ^ (- 1 / 2)) ## link function : x^ - 2
+                                     if (assess_ic) {
+                                         ic_inf_sim <- c(0, as.numeric(ic_inf) ^ (- 1 / 2))
+                                         ic_sup_sim <- c(0, as.numeric(ic_up) ^ (- 1 / 2))
+                                     } else {
+                                         ic_inf_sim <- NA
+                                         ic_sup_sim <- NA
+                               }},
+           "binomial" = {
+                             coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
+                             if (assess_ic) {
+                                 ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
+                                 ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
+                             } else {
+                                 ic_inf_sim <- NA
+                                 ic_sup_sim <- NA
+                       }},
+           "quasibinomial" = {
+                                  coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
+                                  if (assess_ic) {
+                                      ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
+                                      ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
+                                  } else {
+                                      ic_inf_sim <- NA
+                                      ic_sup_sim <- NA
+                            }},
+           "Gamma" = {
+                          coefyear <- c(1, as.numeric(coefan) ^ (- 1)) ## link function : -x^ - 1
+                          if (assess_ic) {
+                              ic_inf_sim <- c(1, as.numeric(ic_inf) ^ (- 1))
+                              ic_sup_sim <- c(1, as.numeric(ic_up) ^ (- 1))
+                          } else {
+                              ic_inf_sim <- NA
+                              ic_sup_sim <- NA
+                    }}
+         , {
+                coefyear <- c(0, as.numeric(coefan))
+                if (assess_ic) {
+                    ic_inf_sim <- c(0, as.numeric(ic_inf))
+                    ic_sup_sim <- c(0, as.numeric(ic_up))
+                } else {
+                    ic_inf_sim <- NA
+                    ic_sup_sim <- NA
+            }})
+
+    pval <- c(1, unlist(lapply(an, FUN = function(x) {
+                                             if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]) > 0) {
+                                                 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]
+                                              }else{
+                                                  NA
+                                              }
+                                                    }))) ## extract p value for each year
+
+
+    tab1 <- data.frame(year, val = coefyear,  ## table for the graphical output 1
+                       ll = unlist(ic_inf_sim), ul = unlist(ic_sup_sim),
+                       catPoint = ifelse(pval < s_signif, "significatif", NA), pval,
+                       courbe = vpan[1],
+                       panel = vpan[1])
+    ## cleaning of wrong or biaised measures of the confidence interval
+    if (assess_ic) {
+        tab1$ul <-  ifelse(tab1$ul == Inf, NA, tab1$ul)
+        tab1$ul <-  ifelse(tab1$ul > 1.000000e+20, NA, tab1$ul)
+        tab1$val <-  ifelse(tab1$val > 1.000000e+20, 1.000000e+20, tab1$val)
+    }
+
+    coefancontinu <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Estimate", colnames(glmtab))])) ##  tendency of population evolution on the studied period = regression coefficient of the variable year as a continuous variable in the GLM
+    switch(distrib, ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
+           "poisson" = {
+                           trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
+                            pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
+                       },
+           "quasipoisson" = {
+                                 trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
+                                 pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
+                            },
+           "inverse.gaussian" = {
+                                     trend <- round(as.numeric(coefancontinu) ^ (- 1 / 2), 3) ## link function : x^ - 2
+                                     pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1 / 2)) - 1) * 100, 2)
+                                },
+           "binomial" = {
+                             trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
+                             pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
+                        },
+           "quasibinomial" = {
+                                  trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
+                                  pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
+                             },
+           "Gamma" = {
+                          trend <- round(as.numeric(coefancontinu) ^ (- 1), 3) ## link function : -x^ - 1
+                          pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1)) - 1) * 100, 2)
+                     }
+         , {
+                trend <- round(as.numeric(coefancontinu), 3)
+                pourcentage <-  round((((as.numeric(coefancontinu) * as.numeric(pasdetemps))) - 1) * 100, 2)
+           })
+
+    pval <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Pvalue", colnames(glmtab))])) ## Extract p value
+
+    ## table for global trend on the whole time series
+    tab1t <- NULL
+    if (length(pval) > 0) {
+        tab1t <- data.frame(Est = trend, pourcent = pourcentage, signif = pval < s_signif, pval)
+    }
+
+    ##### Table 2
+
+    if (sp == "global") { ## prepare metrics table to extract variables used in graph 2
+        datatablecut <- datatable[grep("FALSE", is.na(datatable[, as.character(metric)])), ]
+
+    }else{
+
+        datatablecut <- datatable[datatable[, as.character(cut)] == sp, ]
+        datatablecut <- datatablecut[grep("FALSE", is.na(datatablecut[, as.character(metric)])), ]
+    }
+
+    switch(as.character(metric), ## Different value represented graph 2 depending on the nature of the metric
+           "number" = {
+                           valplot <- lapply(sort(year), FUN = function(x) {
+                                                                   sum(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
+                                                                           })
+                      }, ## sum if abundance
+           "presence_absence" = {
+                                     nb_loc <- lapply(sort(year), FUN = function(x) {
+                                                                            length(unique(subset(datatablecut, year == x)[, "location"]))
+                                                                                    }) ## number of plots per year
+                                     nb_loc_presence <- lapply(sort(year), FUN = function(x) {
+                                                                                     length(unique(subset(datatablecut[datatablecut[, metric] > 0, ], year == x)[, "location"]))
+                                                                                             }) ##  number of plots where the species were observed
+                                     valplot <- (na.omit(as.numeric(nb_loc_presence)) / na.omit(as.numeric(nb_loc))) * 100
+                                } ## % of presence in observered plots if presence / absence
+         , {
+                valplot <- lapply(sort(year), FUN = function(x) {
+                                                        mean(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
+                                                                })
+           } ## mean if any other metric
+          )
+
+    tab2 <- data.frame(year, val = round(as.numeric(valplot), 2), ll = NA, ul = NA, catPoint = NA, pval = NA,
+                       courbe = vpan[2], panel = vpan[2])
+
+    ## Creating ggplots
+
+    dgg <- tab1
+
+    figname <- paste(sp, ".png", sep = "")
+
+    ## coord for horizontal lines in graphs
+    hline_data1 <- data.frame(z = tab1$val[1], panel = c(vpan[1]), couleur = "var estimates", type = "var estimates")
+    hline_data3 <- data.frame(z = 0, panel = vpan[2], couleur = "seuil", type = "seuil")
+    hline_data <- rbind(hline_data1, hline_data3)
+    titre <- paste(sp)
+
+    ## text for the population evolution trend
+
+    pasdetemps <- max(dgg$year) - min(dgg$year) + 1
+    if (! is.null(tab1t)) {
+        if (assess_ic) {
+            txt_pente1 <- paste("Global trend : ", tab1t$Est,
+                               ifelse(tab1t$signif, " *", ""),
+                               ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
+                                                         abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), ""), sep = "")
+        }else{
+            txt_pente1 <- ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
+                                                   abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), "")
+        }
+    }else{
+        trend_on_graph <- FALSE
+    }
+
+    ## table of the text for the population evolution trend
+    tab_text_pent <- data.frame(y = c(max(c(dgg$val, dgg$ul), na.rm = TRUE) * .9),
+                              x = median(dgg$year),
+                              txt = ifelse(trend_on_graph, c(txt_pente1), ""),
+                              courbe = c(vpan[1]), panel = c(vpan[1]))
+
+    dgg <- rbind(tab1, tab2)
+
+    ## colors for plots
+    vec_col_point <- c("#ffffff", "#eeb40f", "#ee0f59")
+    names(vec_col_point) <- c("significatif", "infSeuil", "0")
+    vec_col_courbe <- c("#3c47e0", "#5b754d", "#55bb1d", "#973ce0")
+    names(vec_col_courbe) <- c(vpan[1], "loc", "presence", vpan[2])
+    vec_col_hline <- c("#ffffff", "#e76060")
+    names(vec_col_hline) <- c("var estimates", "seuil")
+
+    col <- c(vec_col_point, vec_col_courbe, vec_col_hline)
+    names(col) <- c(names(vec_col_point), names(vec_col_courbe), names(vec_col_hline))
+
+    p <- ggplot2::ggplot(data = dgg, mapping = ggplot2::aes_string(x = "year", y = "val"))
+    ## Titles and scales
+    p <- p + facet_grid(panel ~ ., scale = "free") +
+    theme(legend.position = "none",
+          panel.grid.minor = element_blank(),
+          panel.grid.major.y = element_blank(),
+          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))  +
+    ylab("") + xlab("year") + ggtitle(titre) +
+    scale_colour_manual(values = col, name = "",
+                             breaks = names(col)) +
+    scale_x_continuous(breaks = min(dgg$year):max(dgg$year))
+    p <- p + ggplot2::geom_hline(data = hline_data, mapping = ggplot2::aes_string(yintercept = "z", colour = "couleur", linetype = "type"),
+                    alpha = 1, size = 1.2)
+    if (assess_ic) { ############# ONLY FOR THE CONFIDENCE INTERVAL
+        p <- p + ggplot2::geom_ribbon(mapping = ggplot2::aes_string(ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
+        p <- p + ggplot2::geom_pointrange(mapping = ggplot2::aes_string(y = "val", ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
+    }
+
+    p <- p + ggplot2::geom_line(mapping = ggplot2::aes_string(colour = "courbe"), size = 1.5)
+    p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "courbe"), size = 3)
+    alph <- ifelse(!is.na(dgg$catPoint), 1, 0)
+    p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "catPoint", alpha = alph), size = 2)
+    p <- p + ggplot2::geom_text(data = tab_text_pent, mapping = ggplot2::aes_string("x", "y", label = "txt"), parse = FALSE, color = col[vpan[1]], fontface = 2, size = 4)
+    ggplot2::ggsave(figname, p, width = 16, height = 15, units = "cm")
+}
+############################################################################################################ fin fonction graphique / end of function for graphical output
+
+################# Analysis
+
+for (sp in glmtable[, 1]) {
+
+    if (!all(is.na(glmtable[glmtable[, 1] == sp, 4:(length(glmtable) - 1)]))) { ##ignore lines with only NA
+        ggplot_glm(glmtable = glmtable, datatable = datatable, unitobs = unitobs, metric = metric, sp = sp, description = TRUE, trend_on_graph = TRUE, assess_ic = assess_ic)
+    }
+}
+
+sink("stdout.txt", split = TRUE)