view pubmed_by_queries.R @ 0:3f4adc85ba5d draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:01:50 +0000 |
parents | |
children | |
#!/usr/bin/env Rscript
# tool: pubmed_by_queries
#
# This tool uses a set of search queries to download a defined number of abstracts or
# PMIDs per search query from PubMed. PubMed's search rules and syntax apply.
#
# Input: Tab-delimited table with search queries in a column starting with "ID_",
#        e.g. "ID_gene" if search queries are genes.
#
# Output: Input table with additional columns with PMIDs or abstracts (--abstract) from PubMed.
#
# Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
#
# optional arguments:
#  -h, --help                 show this help message and exit
#  -i INPUT, --input INPUT    input file name; add path if file is not in working directory
#  -o OUTPUT, --output OUTPUT output file name [default "pubmed_by_queries_output"]
#  -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"]
#  -a, --abstract             retrieve abstracts instead of PMIDs
#  -k KEY, --key KEY          if an NCBI API key is available, add it to speed up the fetching of PubMed data

if ('--install_packages' %in% commandArgs()) {
  print('Installing packages')
  if (!require('argparse')) install.packages('argparse', repos = "http://cran.rstudio.com/")
  if (!require('easyPubMed')) install.packages('easyPubMed', repos = "http://cran.rstudio.com/")
}

suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "retrieve abstracts instead of PMIDs")
parser$add_argument("-k", "--key", type = "character",
                    help = "if an NCBI API key is available, add it to speed up the download of PubMed data")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "auto-install missing required packages")
args <- parser$parse_args()

MAX_WEB_TRIES <- 100

data <- read.delim(args$input, stringsAsFactors = FALSE)
id_col_index <- grep("ID_", names(data))

pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
  if (is.null(query)) { print(data) }
  pubmed_search <- get_pubmed_ids(query, api_key = key)

  if (as.numeric(pubmed_search$Count) == 0) {
    cat("No PubMed result for the following query: ", query, "\n")
    return(data)

  } else if (abstract == FALSE) { # fetch PMIDs

    myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                         "db=pubmed&retmax=", number,
                         "&term=", pubmed_search$OriginalQuery,
                         "&usehistory=n", sep = "")

    # get IDs, retrying with a growing back-off on connection errors
    idXML <- c()
    for (i in 1:MAX_WEB_TRIES) {
      tryCatch({
        IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
        idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
        suppressWarnings(close(IDconnect))
        break
      }, error = function(e) {
        print(paste('Error getting URL, sleeping', 2 * i, 'seconds.'))
        print(e)
        Sys.sleep(time = 2 * i)
      })
    }

    # extract the PMIDs from the esearch XML
    PMIDs <- c()
    for (i in seq_along(idXML)) {
      if (grepl("^<Id>", idXML[i])) {
        pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
        PMIDs <- c(PMIDs, as.character(pmid[1]))
      }
    }
    if (length(PMIDs) > 0) {
      data[row, sapply(seq_along(PMIDs), function(i) { paste0("PMID_", i) })] <- PMIDs
      cat(length(PMIDs), " PMIDs for ", query, " are added in the table.", "\n")
    }
    return(data)

  } else if (abstract == TRUE) { # fetch abstracts and title text

    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv,
                        "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null", "&retmode=", "xml", sep = "")
    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }

    # initialize
    out.data <- NULL
    try_num <- 1
    t_0 <- Sys.time()

    # try to fetch results
    while (is.null(out.data)) {

      # timing check: kill the request after 3 minutes
      if (try_num > 1) {
        Sys.sleep(time = 2 * try_num)
        cat("Problem receiving PubMed data or an error was received. Please wait. Try number:", try_num, "\n")
      }
      t_1 <- Sys.time()
      if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
        message("Killing the request! Something is not working. Please, try again later", "\n")
        return(data)
      }

      # connect to the ENTREZ server
      out.data <- tryCatch({
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        print(e)
        NULL # return NULL so the fetch is retried
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })

      # check whether an <ERROR> tag came back instead of data
      if (!is.null(out.data) &&
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
        out.data <- NULL
      }
      try_num <- try_num + 1
    }

    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    # process XML data
    xml_data <- paste(out.data, collapse = "")

    # split into one string per article
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
      # trim extra stuff at the end of the record
      if (!grepl("</PubmedArticle>$", x)) x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
      # rebuild XML structure and proceed
      x <- paste("<PubmedArticle>", x)
      gsub("[[:space:]]{2,}", " ", x)
    }, USE.NAMES = FALSE, simplify = TRUE)

    # titles
    titles <- sapply(xml_data, function(x) {
      x <- custom_grep(x, tag = "ArticleTitle", format = "char")
      x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
      } else if (length(x) < 1) {
        x <- NA
      }
      x
    }, USE.NAMES = FALSE, simplify = TRUE)

    # abstracts
    abstract.text <- sapply(xml_data, function(x) {
      custom_grep(x, tag = "AbstractText", format = "char")
    }, USE.NAMES = FALSE, simplify = TRUE)
    abstracts <- sapply(abstract.text, function(x) {
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      } else if (length(x) < 1) {
        x <- NA
      } else {
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      }
      x
    }, USE.NAMES = FALSE, simplify = TRUE)

    # prepend the title to each abstract
    if (length(titles) == length(abstracts)) {
      abstracts <- paste(titles, abstracts)
    }

    # add abstracts to the data frame
    if (length(abstracts) > 0) {
      data[row, sapply(seq_along(abstracts), function(i) { paste0("ABSTRACT_", i) })] <- abstracts
      cat(length(abstracts), " abstracts for ", query, " are added in the table.", "\n")
    }
    return(data)
  }
}

for (i in 1:nrow(data)) {
  data <- tryCatch(pubmed_data_in_table(data = data,
                                        row = i,
                                        query = data[i, id_col_index],
                                        number = args$number,
                                        key = args$key,
                                        abstract = args$abstract),
                   error = function(e) {
                     print('main error')
                     print(e)
                     Sys.sleep(5)
                     data # keep the table intact if a query fails
                   })
}

write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
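A minimal usage sketch (the file name, column name, and gene symbols below are hypothetical; the flags are the ones defined by the parser above): the query column must start with "ID_", and the output table gains one PMID_n or ABSTRACT_n column per retrieved record.

# genes.txt (tab-delimited, hypothetical input):
# ID_gene
# SCN1A
# DEPDC5

$ pubmed_by_queries.R --input genes.txt --output genes_abstracts.txt --number 10 --abstract
$ pubmed_by_queries.R --input genes.txt --key <NCBI_API_KEY>   # default: 5 PMIDs per gene, written to "pubmed_by_queries_output"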