view goprofiles.R @ 15:601027649251 draft default tip

"planemo upload commit 6bc2c32177acd823d5025bc90a562e6e6b2a233a"
author proteore
date Tue, 06 Apr 2021 16:59:30 +0000
parents 3ddc1f78773d
children
line wrap: on
line source

options(warn = -1)  #TURN OFF WARNINGS !!!!!!

# Load necessary libraries
suppressMessages(library(goProfiles, quietly = TRUE))

# Read file and return file content as data.frame
read_file <- function(path, header) {
  file <- try(read.csv(path, header = header,
    sep = "\t", stringsAsFactors = FALSE,
      quote = "\"", check.names = F), silent = TRUE)
  if (inherits(file, "try-error")) {
    stop("File not found !")
  }else{
    return(file)
  }
}

#convert a string to boolean
str2bool <- function(x) {
  if (any(is.element(c("t", "true"), tolower(x)))) {
    return(TRUE)
  }else if (any(is.element(c("f", "false"), tolower(x)))) {
    return(FALSE)
  }else{
    return(NULL)
  }
}

check_ids <- function(vector, type) {
 # nolint start
  uniprot_pattern <-
    "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
  entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
  # nolint end
  if (type == "Entrez") {
    return(grepl(entrez_id, vector))
  } else if (type == "UniProt") {
    return(grepl(uniprot_pattern, vector))
  }
}

getprofile <- function(ids, id_type, level, duplicate, species) {
  ####################################################################
  # Arguments
  #   - ids: list of input IDs
  #   - id_type: type of input IDs (UniProt/ENTREZID)
  #   - level
  #   - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE)
  #   - species
  ####################################################################

  library(species, character.only = TRUE, quietly = TRUE)

  if (species == "org.Hs.eg.db") {
    package <- org.Hs.eg.db #nolint
  } else if (species == "org.Mm.eg.db") {
    package <- org.Mm.eg.db #nolint
  } else if (species == "org.Rn.eg.db") {
    package <- org.Rn.eg.db #nolint
  }

  # Check if level is number
  if (! as.numeric(level) %% 1 == 0) {
    stop("Please enter an integer for level")
  } else {
    level <- as.numeric(level)
  }


  # Extract Gene Entrez ID
  if (id_type == "Entrez") {
    id <- select(package, ids, "ENTREZID", multiVals = "first") #nolint
  } else {
    id <- select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") #nolint
  }
  if (duplicate) {
    id <- unique(id)
  }
  genes_ids <- id$ENTREZID[which(!is.na(id$ENTREZID))]
  nas <- id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID #nolint

  # Create basic profiles
  profile.CC <-
    basicProfile(genes_ids, onto = "CC", level = level,
    orgPackage = species, empty.cats = F, ord = T, na.rm = T)
  profile.BP <-
    basicProfile(genes_ids, onto = "BP", level = level,
    orgPackage = species, empty.cats = F, ord = T, na.rm = T)
  profile.MF <-
    basicProfile(genes_ids, onto = "MF", level = level,
    orgPackage = species, empty.cats = F, ord = T, na.rm = T)
  profile.ALL <-
    basicProfile(genes_ids, onto = "ANY", level = level,
    orgPackage = species, empty.cats = F, ord = T, na.rm = T)

  return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
}

#return height and width of plot in inches from profile
plot_size_from_nb_onto <- function(profile) {
  width <- 10
  range <- seq(25, 2000, by = 25)
  names(range) <- seq(5, 242, by = 3)
  nb_onto <- round(nrow(profile[[1]]) / 25) * 25
  if (nb_onto < 25) {
      nb_onto <- 25
  }
  if (nb_onto <= 2000) {
    height <- as.integer(names(which(range == nb_onto)))
  } else {
    height <- 250
  }
  return(c(width, height))
}

make_plot <- function(profile, percent, title, onto, plot_opt) {

  tmp <- plot_size_from_nb_onto(profile)
  width <- tmp[1]
  height <- tmp[2]

  if (plot_opt == "PDF") {
    file_name <- paste("profile_", onto, ".pdf", collapse = "", sep = "")
    pdf(file_name, width = width, height = height)
  } else if (plot_opt == "JPEG") {
    file_name <- paste("profile_", onto, ".jpeg", collapse = "", sep = "")
    jpeg(file_name, width = width, height = height, units = "in", res = 100)
  } else if (plot_opt == "PNG") {
    file_name <- paste("profile_", onto, ".png", collapse = "", sep = "")
    png(file_name, width = width, height = height, units = "in", res = 100)
  }
  plotProfiles(profile, percentage = percent,
    multiplePlots = FALSE, aTitle = title)
  dev.off()
}

goprofiles <- function() {
  args <- commandArgs(TRUE)
  if (length(args) < 1) {
    args <- c("--help")
  }

  # Help section
  if ("--help" %in% args) {
    cat("Selection and Annotation HPA
    Arguments:
        --input_type: type of input (list of id or filename)
        --input: input
        --ncol: the column number which you would like to apply...
        --header: true/false if your file contains a header
        --id_type: the type of input IDs (UniProt/EntrezID)
        --onto_opt: ontology options
        --plot_opt: plot extension options (PDF/JPEG/PNG)
        --level: 1-3
        --per
        --title: title of the plot
        --duplicate: remove dupliate input IDs (true/false)
        --text_output: text output filename \n
        --species")
    q(save = "no")
  }

  # Parse arguments
  parse_args <- function(x) strsplit(sub("^--", "", x), "=")
  args_df <- as.data.frame(do.call("rbind", parse_args(args)))
  args <- as.list(as.character(args_df$V2))
  names(args) <- args_df$V1


  id_type <- args$id_type
  input_type <- args$input_type
  if (input_type == "text") {
    input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
  } else if (input_type == "file") {
    filename <- args$input
    ncol <- args$ncol
    # Check ncol
    if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
      stop("Please enter an integer for level")
    } else {
      ncol <- as.numeric(gsub("c", "", ncol))
    }
    header <- str2bool(args$header)
    # Get file content
    file <- read_file(filename, header)
    # Extract Protein IDs list
    input <- unlist(strsplit(as.character(file[, ncol]), ";"))
  }
  input <- input [which(!is.na(gsub("NA", NA, input)))]

  if (! any(check_ids(input, id_type))) {
    stop(paste(id_type,
    "not found in your ids list, please check your IDs in input
     or the selected column of your input file"))
  }

  ontoopt <- strsplit(args$onto_opt, ",")[[1]]
  onto_pos <- as.integer(gsub("BP", 3, gsub("MF", 2, gsub("CC", 1, ontoopt))))
  plotopt <- args$plot_opt
  level <- args$level
  per <- as.logical(args$per)
  title <- args$title
  duplicate <- str2bool(args$duplicate)
  text_output <- args$text_output
  species <- args$species

  profiles <- getprofile(input, id_type, level, duplicate, species)

  for (index in onto_pos) {
    onto <- names(profiles[index])
    profile <- profiles[index]
    make_plot(profile, per, title, onto, plotopt)
    text_output <- paste("goProfiles_", onto, "_",
            title, ".tsv", sep = "", collapse = "")
    profile <- as.data.frame(profile)
    profile <- as.data.frame(apply(profile, c(1, 2),
                function(x) gsub("^$|^ $", NA, x)))  #convert "" and " " to NA
    write.table(profile, text_output, sep = "\t",
                row.names = FALSE, quote = FALSE, col.names = T)
  }
}

goprofiles()