Mercurial > repos > iuc > abstracts_by_pmids
diff pubmed_by_queries.R @ 0:ff904894ccaa draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author | iuc |
---|---|
date | Wed, 24 Mar 2021 08:32:54 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pubmed_by_queries.R Wed Mar 24 08:32:54 2021 +0000 @@ -0,0 +1,258 @@ +#!/usr/bin/env Rscript +#tool: pubmed_by_queries +# +#This tool uses a set of search queries to download a defined number of abstracts or +#PMIDs for search query from PubMed. PubMed's search rules and syntax apply. +# +#Input: Tab-delimited table with search queries in a column starting with "ID_", +#e.g. "ID_gene" if search queries are genes. +# +#Output: Input table with additional columns +#with PMIDs or abstracts (--abstracts) from PubMed. +# +#Usage: +#$pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY] +# +#optional arguments: +# -h, --help show this help message and exit +# -i INPUT, --input INPUT input file name. add path if file is not in working directory +# -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"] +# -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"] +# -a, --abstract if abstracts instead of PMIDs should be retrieved use --abstracts +# -k KEY, --key KEY if ncbi API key is available, add it to speed up the download of PubMed data. +# For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information). + +if ("--install_packages" %in% commandArgs()) { + print("Installing packages") + if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/") ; + if (!require("easyPubMed")) install.packages("easyPubMed", repo = "http://cran.rstudio.com/") ; +} + +suppressPackageStartupMessages(library("argparse")) +suppressPackageStartupMessages(library("easyPubMed")) + +parser <- ArgumentParser() +parser$add_argument("-i", "--input", + help = "Input fie name. add path if file is not in working directory") +parser$add_argument("-o", "--output", default = "pubmed_by_queries_output", + help = "Output file name. [default \"%(default)s\"]") +parser$add_argument("-n", "--number", type = "integer", default = 5, + help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]") +parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE, + help = "If abstracts instead of PMIDs should be retrieved use --abstracts ") +parser$add_argument("-k", "--key", type = "character", + help = "If ncbi API key is available, add it to speed up the download of PubMed data. For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information).") +parser$add_argument("--install_packages", action = "store_true", default = FALSE, + help = "If you want to auto install missing required packages.") +args <- parser$parse_args() + +if (!is.null(args$key)) { + if (file.exists(args$key)) { + credentials <- read.table(args$key, quote = "\"", comment.char = "") + args$key <- credentials[1, 1] + } +} + +max_web_tries <- 100 + +data <- read.delim(args$input, stringsAsFactors = FALSE) + +id_col_index <- grep("ID_", names(data)) + + +fetch_pmids <- function(data, number, pubmed_search, query, row, max_web_tries) { + my_pubmed_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", + "db=pubmed&retmax=", number, + "&term=", pubmed_search$OriginalQuery, + "&usehistory=n", sep = "") + # get ids + idxml <- c() + for (i in seq(max_web_tries)) { + tryCatch({ + id_connect <- suppressWarnings(url(my_pubmed_url, open = "rb", encoding = "UTF8")) + idxml <- suppressWarnings(readLines(id_connect, warn = FALSE, encoding = "UTF8")) + suppressWarnings(close(id_connect)) + break + }, error = function(e) { + print(paste("Error getting URL, sleeping", 2 * i, "seconds.")) + print(e) + Sys.sleep(time = 2 * i) + }) + } + pmids <- c() + for (i in seq(length(idxml))) { + if (grepl("^<Id>", idxml[i])) { + pmid <- custom_grep(idxml[i], tag = "Id", format = "char") + pmids <- c(pmids, as.character(pmid[1])) + } + } + if (length(pmids) > 0) { + data[row, sapply(seq(length(pmids)), function(i) { + paste0("PMID_", i) + })] <- pmids + cat(length(pmids), " PMIDs for ", query, " are added in the table.", "\n") + } + return(data) +} + + +fetch_abstracts <- function(data, number, query, pubmed_search) { + efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", + "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey, + "&retstart=", 0, "&retmax=", number, + "&rettype=", "null", "&retmode=", "xml", sep = "") + api_key <- pubmed_search$APIkey + if (!is.null(api_key)) { + efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "") + } + # initialize + out_data <- NULL + try_num <- 1 + t_0 <- Sys.time() + # Try to fetch results + while (is.null(out_data)) { + # Timing check: kill at 3 min + if (try_num > 1) { + Sys.sleep(time = 2 * try_num) + cat("Problem to receive PubMed data or error is received. Please wait. Try number:", + try_num, "\n") + } + t_1 <- Sys.time() + if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) { + message("Killing the request! Something is not working. Please, try again later", + "\n") + return(data) + } + # ENTREZ server connect + out_data <- tryCatch({ + tmp_connect <- suppressWarnings(url(efetch_url, + open = "rb", + encoding = "UTF8")) + suppressWarnings(readLines(tmp_connect, + warn = FALSE, + encoding = "UTF8")) + }, error = function(e) { + print(e) + }, finally = { + try(suppressWarnings(close(tmp_connect)), + silent = TRUE) + }) + # Check if error + if (!is.null(out_data) && + class(out_data) == "character" && + grepl("<ERROR>", substr(paste(utils::head(out_data, n = 100), + collapse = ""), 1, 250))) { + out_data <- NULL + } + try_num <- try_num + 1 + } + if (is.null(out_data)) { + message("Killing the request! Something is not working. Please, try again later", + "\n") + return(data) + } else { + return(out_data) + } +} + + +process_xml_abstracts <- function(out_data) { + xml_data <- paste(out_data, collapse = "") + # articles to list + xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1] + xml_data <- sapply(xml_data, function(x) { + #trim extra stuff at the end of the record + if (!grepl("</PubmedArticle>$", x)) + x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) + # Rebuid XML structure and proceed + x <- paste("<PubmedArticle>", x) + gsub("[[:space:]]{2,}", " ", x) + }, + USE.NAMES = FALSE, simplify = TRUE) + #titles + titles <- sapply(xml_data, function(x) { + x <- custom_grep(x, tag = "ArticleTitle", format = "char") + x <- gsub("</{0,1}i>", "", x, ignore.case = T) + x <- gsub("</{0,1}b>", "", x, ignore.case = T) + x <- gsub("</{0,1}sub>", "", x, ignore.case = T) + x <- gsub("</{0,1}exp>", "", x, ignore.case = T) + if (length(x) > 1) { + x <- paste(x, collapse = " ", sep = " ") + } else if (length(x) < 1) { + x <- NA + } + x + }, + USE.NAMES = FALSE, simplify = TRUE) + # abstracts + abstract_text <- sapply(xml_data, function(x) { + custom_grep(x, tag = "AbstractText", format = "char") + }, + USE.NAMES = FALSE, simplify = TRUE) + abstracts <- sapply(abstract_text, function(x) { + if (length(x) > 1) { + x <- paste(x, collapse = " ", sep = " ") + x <- gsub("</{0,1}i>", "", x, ignore.case = T) + x <- gsub("</{0,1}b>", "", x, ignore.case = T) + x <- gsub("</{0,1}sub>", "", x, ignore.case = T) + x <- gsub("</{0,1}exp>", "", x, ignore.case = T) + } else if (length(x) < 1) { + x <- NA + } else { + x <- gsub("</{0,1}i>", "", x, ignore.case = T) + x <- gsub("</{0,1}b>", "", x, ignore.case = T) + x <- gsub("</{0,1}sub>", "", x, ignore.case = T) + x <- gsub("</{0,1}exp>", "", x, ignore.case = T) + } + x + }, + USE.NAMES = FALSE, simplify = TRUE) + #add title to abstracts + if (length(titles) == length(abstracts)) { + abstracts <- paste(titles, abstracts) + } + return(abstracts) +} + + +pubmed_data_in_table <- function(data, row, query, number, key, abstract) { + if (is.null(query)) { + print(data) + } + pubmed_search <- get_pubmed_ids(query, api_key = key) + if (as.numeric(pubmed_search$Count) == 0) { + cat("No PubMed result for the following query: ", query, "\n") + return(data) + } else if (abstract == FALSE) { # fetch PMIDs + data <- fetch_pmids(data, number, pubmed_search, query, row, max_web_tries) + return(data) + } else if (abstract == TRUE) { # fetch abstracts and title text + out_data <- fetch_abstracts(data, number, query, pubmed_search) + abstracts <- process_xml_abstracts(out_data) + #add abstracts to data frame + if (length(abstracts) > 0) { + data[row, sapply(seq(length(abstracts)), + function(i) { + paste0("ABSTRACT_", i) + })] <- abstracts + cat(length(abstracts), " abstracts for ", query, " are added in the table.", + "\n") + } + return(data) + } +} + +for (i in seq(nrow(data))) { + data <- tryCatch(pubmed_data_in_table(data = data, + row = i, + query = data[i, id_col_index], + number = args$number, + key = args$key, + abstract = args$abstract), error = function(e) { + print("main error") + print(e) + Sys.sleep(5) + }) +} + +write.table(data, args$output, append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)