view GO-enrich.R @ 13:f6107b8ae8f8 draft default tip

"planemo upload commit be9070db4a3b178ab45ecd9c9c3ab369240f7617-dirty"
author proteore
date Fri, 09 Apr 2021 14:39:05 +0000
parents d951677a50d4
children
line wrap: on
line source

options(warn = -1)  #TURN OFF WARNINGS !!!!!!
suppressMessages(library(clusterProfiler, quietly = TRUE))

# Read file and return file content as data.frame
read_file <- function(path, header) {
  file <- try(read.csv(path, header = header, sep = "\t",
                       stringsAsFactors = FALSE, quote = "", check.names = F),
                      silent = TRUE)
  if (inherits(file, "try-error")) {
    stop("File not found !")
  }else{
    file <- file[!apply(is.na(file) | file == "", 1, all), , drop = FALSE]
    return(file)
  }
}


#return the number of character from the longest description found
#(from the 10 first)
max_str_length_10_first <- function(vector) {
  vector <- as.vector(vector)
  nb_description <- length(vector)
  if (nb_description >= 10) {
    nb_description <- 10
    }

  return(max(nchar(vector[1:nb_description])))
}

str2bool <- function(x) {
  if (any(is.element(c("t", "true"), tolower(x)))) {
    return(TRUE)
  }else if (any(is.element(c("f", "false"), tolower(x)))) {
    return(FALSE)

  }else{
    return(NULL)
  }
}

#used before the limit was set to 50 characters
width_by_max_char <- function(nb_max_char) {
  if (nb_max_char < 50) {
    width <- 600
  } else if (nb_max_char < 75) {
    width <- 800
  } else if (nb_max_char < 100) {
    width <- 900
  } else {
    width <- 1000
  }
  return(width)
}


repartition_go <- function(geneid, orgdb, ontology,
                           level = 3, readable = TRUE) {
  ggo <- groupGO(gene = geneid,
                OrgDb = orgdb,
                ont = ontology,
                level = level,
                readable = TRUE)


  if (length(ggo@result$ID) > 0) {
    ggo@result$Description <- sapply(as.vector(ggo@result$Description), #nolint
                              function(x) {
                                ifelse(nchar(x) > 50,
                                substr(x, 1, 50), x)}, USE.NAMES = FALSE)


    name <- paste("GGO_", ontology, "_bar-plot", sep = "")
    png(name, height = 720, width = 600)
    p <- barplot(ggo, showCategory = 10)
    print(p)
    dev.off()
    ggo <- as.data.frame(ggo)
    return(ggo)
  }
}

# nolint end

# GO over-representation test
enrich_go <- function(geneid, universe, orgdb, ontology, pval_cutoff,
                      qval_cutoff, plot) {
  ego <- enrichGO(gene = geneid,

                universe = universe,
                OrgDb = orgdb,
                ont = ontology,
                pAdjustMethod = "BH",
                pvalueCutoff = pval_cutoff,
                qvalueCutoff = qval_cutoff,
                readable = TRUE)

  # Plot bar & dot plots
  #if there are enriched GoTerms



  if (length(ego$ID) > 0) {

    ego@result$Description <- sapply(ego@result$Description, function(x) {  #nolint
      ifelse(nchar(x) > 50, substr(x, 1, 50), x)
      }, USE.NAMES = FALSE)


    if ("dotplot" %in% plot) {
    dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
    png(dot_name, height = 720, width = 600)
    p <- dotplot(ego, showCategory = 10)  #nolint
    print(p)
    dev.off()
    }

    if ("barplot" %in% plot) {
    bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
    png(bar_name, height = 720, width = 600)
    p <- barplot(ego, showCategory = 10)
    print(p)
    dev.off()
    }

    ego <- as.data.frame(ego)
    return(ego)
  } else {
    warning(paste("No Go terms enriched (EGO) found for ",
    ontology, "ontology"), immediate. = TRUE, noBreaks. = TRUE, call. = FALSE)
  }
}


clean_ids <- function(ids) {
  ids <- gsub(" ", "", ids)
  ids <- ids[which(ids != "")]
  ids <- ids[which(ids != "NA")]
  ids <- ids[!is.na(ids)]

  return(ids)
}

check_ids <- function(vector, type) {
  uniprot_pattern <- "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" #nolint
  entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" #nolint
  if (type == "entrez") {
    return(grepl(entrez_id, vector))
  }else if (type == "uniprot") {
    return(grepl(uniprot_pattern, vector))
  }
}

get_args <- function() {
  args <- commandArgs(TRUE)
  if (length(args) < 1) {
    args <- c("--help")
  }

  # Help section
  if ("--help" %in% args) {
    cat("clusterProfiler Enrichment Analysis
    Arguments:
        --input_type: type of input (list of id or filename)
        --input: input
        --ncol: the column number which contains list of input IDs
        --header: true/false if your file contains a header
        --id_type: the type of input IDs (UniProt/EntrezID)
        --universe_type: list or filename
        --universe: background IDs list
        --uncol: the column number which contains background IDs list
        --uheader: true/false if the background IDs file contains header
        --universe_id_type: the type of universe IDs (UniProt/EntrezID)
        --species
        --onto_opt: ontology options
        --go_function: groupGO/enrichGO
        --level: 1-3
        --pval_cutoff
        --qval_cutoff
        --text_output: text output filename
        --plot : type of visualization, dotplot or/and barplot \n")
    q(save = "no")
  }
  # Parse arguments

  parseargs <- function(x) strsplit(sub("^--", "", x), "=")
  argsdf <- as.data.frame(do.call("rbind", parseargs(args)))
  args <- as.list(as.character(argsdf$V2))
  names(args) <- argsdf$V1
  return(args)
}

main <- function() {  #nolint
  #get args from command
  args <- get_args()

  go_represent <- str2bool(args$go_represent)
  go_enrich <- str2bool(args$go_enrich)
  if (go_enrich) {
    plot <- unlist(strsplit(args$plot, ","))
  }

  suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))

  # Extract OrgDb
  if (args$species == "org.Hs.eg.db") {
    orgdb <- org.Hs.eg.db #nolint
  } else if (args$species == "org.Mm.eg.db") {
    orgdb <- org.Mm.eg.db #nolint
  } else if (args$species == "org.Rn.eg.db") {
    orgdb <- org.Rn.eg.db #nolint
  }

  # Extract input IDs
  input_type <- args$input_type
  id_type <- args$id_type

  if (input_type == "text") {
    input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
  } else if (input_type == "file") {
    filename <- args$input
    ncol <- args$ncol
    # Check ncol
    if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
      stop("Please enter the right format for column number: c[number]")
    } else {
      ncol <- as.numeric(gsub("c", "", ncol))
    }

    header <- str2bool(args$header)       # Get file content
    file <- read_file(filename, header)   # Extract Protein IDs list
    input <-  unlist(sapply(as.character(file[, ncol]),
                    function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
  }
  input <- clean_ids(input)

  ## Get input gene list from input IDs
  #ID format Conversion
  #This case : from UNIPROT (protein id) to ENTREZ (gene id)
  #bitr = conversion function from clusterProfiler
  if (id_type == "Uniprot" & any(check_ids(input, "uniprot"))) {
    any(check_ids(input, "uniprot"))

    idfrom <- "UNIPROT"
    idto <- "ENTREZID"
    suppressMessages(gene <- bitr(input, fromType = idfrom, toType = idto,
                                  OrgDb = orgdb))

    gene <- unique(gene$ENTREZID)
  } else if (id_type == "Entrez" & any(check_ids(input, "entrez"))) {
    gene <- unique(input)
  } else {

    stop(paste(id_type, "not found in your ids list,
     please check your IDs in input or the selected column of your input file"))

  }

  ontology <- strsplit(args$onto_opt, ",")[[1]]

  ## Extract GGO/EGO arguments
  if (go_represent) {
    level <- as.numeric(args$level)

    }

  if (go_enrich) {
    pval_cutoff <- as.numeric(args$pval_cutoff)
    qval_cutoff <- as.numeric(args$qval_cutoff)
    # Extract universe background genes (same as input file)
    if (!is.null(args$universe_type)) {
      universe_type <- args$universe_type
      if (universe_type == "text") {
        universe <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
      } else if (universe_type == "file") {
        universe_filename <- args$universe
        universe_ncol <- args$uncol
        # Check ncol
        if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
          stop("Please enter the right format for column number: c[number]")
        } else {
          universe_ncol <- as.numeric(gsub("c", "", universe_ncol))
        }
        universe_header <- str2bool(args$uheader)
        # Get file content
        universe_file <- read_file(universe_filename, universe_header)
        # Extract Protein IDs list
        universe <- unlist(sapply(universe_file[, universe_ncol],
                function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
      }
      universe <- clean_ids(input)
      universe_id_type <- args$universe_id_type
      ##to initialize
      if (universe_id_type == "Uniprot" & any(check_ids(universe, "uniprot"))) {
        idfrom <- "UNIPROT"
        idto <- "ENTREZID"
        suppressMessages(universe_gene <- bitr(universe, fromType = idfrom,
                                               toType = idto, OrgDb = orgdb))
        universe_gene <- unique(universe_gene$ENTREZID)
      } else if (universe_id_type == "Entrez" &
                 any(check_ids(universe, "entrez"))) {
        universe_gene <- unique(unlist(universe))
      } else {
        if (universe_type == "text") {
          print(paste(universe_id_type, "not found in your background IDs list",
                      sep = " "))
        } else {
          print(paste(universe_id_type, "not found in the column",
                      universe_ncol, "of your background IDs file", sep = " "))

        }
        universe_gene <- NULL
      }
    } else {
      universe_gene <- NULL
    }
  } else {
    universe_gene <- NULL
  }

  ##enrichGO : GO over-representation test
  for (onto in ontology) {
    if (go_represent) {
      ggo <- repartition_go(gene, orgdb, onto, level, readable = TRUE)
      if (is.list(ggo)) {
        ggo <- as.data.frame(apply(ggo, c(1, 2),
              function(x) gsub("^$|^ $", NA, x)))
      }  #convert "" and " " to NA
      output_path <- paste("cluster_profiler_GGO_", onto, ".tsv", sep = "")
      write.table(ggo, output_path, sep = "\t",
                  row.names = FALSE, quote = FALSE)

    }

    if (go_enrich) {
      ego <- enrich_go(gene, universe_gene, orgdb, onto, pval_cutoff,
                       qval_cutoff, plot)
      if (is.list(ego)) {
        ego <- as.data.frame(apply(ego, c(1, 2),
                function(x) gsub("^$|^ $", NA, x)))
        }  #convert "" and " " to NA
      output_path <- paste("cluster_profiler_EGO_", onto, ".tsv", sep = "")
      write.table(ego, output_path, sep = "\t",
                  row.names = FALSE, quote = FALSE)
    }
  }
}

if (!interactive()) {
  main()
}