pubmed_by_queries.R @ 0:0692d11af909 (draft, default, tip)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
| author | iuc |
|---|---|
| date | Wed, 24 Mar 2021 08:33:25 +0000 |
| parents | |
| children | |
comparison: -1:000000000000 → 0:0692d11af909 (file added)
```r
#!/usr/bin/env Rscript
# tool: pubmed_by_queries
#
# This tool uses a set of search queries to download a defined number of abstracts or
# PMIDs per search query from PubMed. PubMed's search rules and syntax apply.
#
# Input: tab-delimited table with search queries in a column whose name starts with
# "ID_", e.g. "ID_gene" if the search queries are genes.
#
# Output: the input table with additional columns
# containing PMIDs or abstracts (--abstract) from PubMed.
#
# Usage:
# $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
#
# optional arguments:
#  -h, --help                 show this help message and exit
#  -i INPUT, --input INPUT    input file name; add a path if the file is not in the working directory
#  -o OUTPUT, --output OUTPUT output file name [default "pubmed_by_queries_output"]
#  -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"]
#  -a, --abstract             retrieve abstracts instead of PMIDs
#  -k KEY, --key KEY          if an NCBI API key is available, add it to speed up the download
#                             of PubMed data. For usage in Galaxy, add the API key to the Galaxy
#                             user preferences (User / Preferences / Manage Information).
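#
# Example invocation (illustrative; the file names are hypothetical):
# $ Rscript pubmed_by_queries.R -i genes.tab -o genes_pubmed.tab -n 10 -a
# A minimal input table for this call could look like (one query column, tab-delimited):
#   ID_gene
#   BRCA1
#   TP53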

if ("--install_packages" %in% commandArgs()) {
    print("Installing packages")
    if (!require("argparse")) install.packages("argparse", repos = "http://cran.rstudio.com/")
    if (!require("easyPubMed")) install.packages("easyPubMed", repos = "http://cran.rstudio.com/")
}

suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "Input file name. Add a path if the file is not in the working directory.")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "Output file name. [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "Retrieve abstracts instead of PMIDs.")
parser$add_argument("-k", "--key", type = "character",
                    help = "If an NCBI API key is available, add it to speed up the download of PubMed data. For usage in Galaxy, add the API key to the Galaxy user preferences (User / Preferences / Manage Information).")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "Auto-install missing required packages.")
args <- parser$parse_args()

if (!is.null(args$key)) {
    if (file.exists(args$key)) {
        credentials <- read.table(args$key, quote = "\"", comment.char = "")
        args$key <- credentials[1, 1]
    }
}
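# Note on the key file (format as implied by the code above): when --key points to
# an existing file, the first whitespace-delimited token in that file is used as the
# API key, so a file containing just the key on a single line is sufficient.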

max_web_tries <- 100

data <- read.delim(args$input, stringsAsFactors = FALSE)

id_col_index <- grep("ID_", names(data))


fetch_pmids <- function(data, number, pubmed_search, query, row, max_web_tries) {
    my_pubmed_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                           "db=pubmed&retmax=", number,
                           "&term=", pubmed_search$OriginalQuery,
                           "&usehistory=n", sep = "")
    # get ids
    idxml <- c()
    for (i in seq(max_web_tries)) {
        tryCatch({
            id_connect <- suppressWarnings(url(my_pubmed_url, open = "rb", encoding = "UTF8"))
            idxml <- suppressWarnings(readLines(id_connect, warn = FALSE, encoding = "UTF8"))
            suppressWarnings(close(id_connect))
            break
        }, error = function(e) {
            print(paste("Error getting URL, sleeping", 2 * i, "seconds."))
            print(e)
            Sys.sleep(time = 2 * i)
        })
    }
    pmids <- c()
    for (i in seq_along(idxml)) {
        if (grepl("^<Id>", idxml[i])) {
            pmid <- custom_grep(idxml[i], tag = "Id", format = "char")
            pmids <- c(pmids, as.character(pmid[1]))
        }
    }
    if (length(pmids) > 0) {
        data[row, sapply(seq_along(pmids), function(i) {
            paste0("PMID_", i)
        })] <- pmids
        cat(length(pmids), " PMIDs for ", query, " are added to the table.", "\n")
    }
    return(data)
}


fetch_abstracts <- function(data, number, query, pubmed_search) {
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null", "&retmode=", "xml", sep = "")
    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
        efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }
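    # The EFetch call reuses the history-server entry created by get_pubmed_ids
    # (WebEnv/query_key), so the query itself does not need to be repeated in this URL.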
    # initialize
    out_data <- NULL
    try_num <- 1
    t_0 <- Sys.time()
    # try to fetch results
    while (is.null(out_data)) {
        # timing check: kill the request after 3 minutes
        if (try_num > 1) {
            Sys.sleep(time = 2 * try_num)
            cat("Problem receiving PubMed data or an error was returned. Please wait. Try number:",
                try_num, "\n")
        }
        t_1 <- Sys.time()
        if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
            message("Killing the request! Something is not working. Please try again later.",
                    "\n")
            return(data)
        }
        # connect to the ENTREZ server
        out_data <- tryCatch({
            tmp_connect <- suppressWarnings(url(efetch_url,
                                                open = "rb",
                                                encoding = "UTF8"))
            suppressWarnings(readLines(tmp_connect,
                                       warn = FALSE,
                                       encoding = "UTF8"))
        }, error = function(e) {
            print(e)
            NULL # return NULL so the loop retries instead of keeping the condition object
        }, finally = {
            try(suppressWarnings(close(tmp_connect)),
                silent = TRUE)
        })
        # check for an <ERROR> tag in the response
        if (!is.null(out_data) &&
            is.character(out_data) &&
            grepl("<ERROR>", substr(paste(utils::head(out_data, n = 100),
                                          collapse = ""), 1, 250))) {
            out_data <- NULL
        }
        try_num <- try_num + 1
    }
    if (is.null(out_data)) {
        message("Killing the request! Something is not working. Please try again later.",
                "\n")
        return(data)
    } else {
        return(out_data)
    }
}


process_xml_abstracts <- function(out_data) {
    # remove simple inline markup (<i>, <b>, <sub>, <exp>) from PubMed text
    strip_tags <- function(x) {
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
    }
    xml_data <- paste(out_data, collapse = "")
    # split the response into one string per article
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
        # trim extra content at the end of the record
        if (!grepl("</PubmedArticle>$", x))
            x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
        # rebuild the XML structure and proceed
        x <- paste("<PubmedArticle>", x)
        gsub("[[:space:]]{2,}", " ", x)
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # titles
    titles <- sapply(xml_data, function(x) {
        x <- custom_grep(x, tag = "ArticleTitle", format = "char")
        x <- strip_tags(x)
        if (length(x) > 1) {
            x <- paste(x, collapse = " ", sep = " ")
        } else if (length(x) < 1) {
            x <- NA
        }
        x
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # abstracts
    abstract_text <- sapply(xml_data, function(x) {
        custom_grep(x, tag = "AbstractText", format = "char")
    },
    USE.NAMES = FALSE, simplify = TRUE)
    abstracts <- sapply(abstract_text, function(x) {
        if (length(x) < 1) {
            x <- NA
        } else {
            x <- strip_tags(paste(x, collapse = " ", sep = " "))
        }
        x
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # prepend the title to each abstract
    if (length(titles) == length(abstracts)) {
        abstracts <- paste(titles, abstracts)
    }
    return(abstracts)
}


pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
    if (is.null(query)) {
        # no query in this row; return the table unchanged
        return(data)
    }
    pubmed_search <- get_pubmed_ids(query, api_key = key)
    if (as.numeric(pubmed_search$Count) == 0) {
        cat("No PubMed result for the following query: ", query, "\n")
        return(data)
    } else if (!abstract) { # fetch PMIDs
        data <- fetch_pmids(data, number, pubmed_search, query, row, max_web_tries)
        return(data)
    } else { # fetch abstracts and title text
        out_data <- fetch_abstracts(data, number, query, pubmed_search)
        abstracts <- process_xml_abstracts(out_data)
        # add the abstracts to the data frame
        if (length(abstracts) > 0) {
            data[row, sapply(seq_along(abstracts),
                             function(i) {
                                 paste0("ABSTRACT_", i)
                             })] <- abstracts
            cat(length(abstracts), " abstracts for ", query, " are added to the table.",
                "\n")
        }
        return(data)
    }
}

for (i in seq_len(nrow(data))) {
    data <- tryCatch(pubmed_data_in_table(data = data,
                                          row = i,
                                          query = data[i, id_col_index],
                                          number = args$number,
                                          key = args$key,
                                          abstract = args$abstract), error = function(e) {
        print("main error")
        print(e)
        Sys.sleep(5)
        data # keep the current table if this row fails, instead of overwriting it with NULL
    })
}

write.table(data, args$output, append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
```
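
For a quick sanity check of the result, the tab-delimited output can be read back into R. This is a minimal sketch assuming the default output name and PMID mode (both are assumptions, not part of the tool itself):

```r
# Minimal sketch (assumes the default output name and PMID mode):
# the output keeps the original columns and gains PMID_1 ... PMID_n per row.
result <- read.delim("pubmed_by_queries_output", stringsAsFactors = FALSE)
head(result[, grep("^PMID_", names(result))])
```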
