Mercurial > repos > dlalgroup > simtext_app
view abstracts_by_pmids.R @ 1:429b1df6b7a9 draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 04:32:14 +0000 |
parents | 34ed44f3f85c |
children |
line wrap: on
line source
#!/usr/bin/env Rscript
# TOOL2 abstracts_by_pmids
#
# This tool retrieves, for all PMIDs in each row of a table, the corresponding
# abstracts from PubMed and saves them in additional columns.
#
# Input: Tab-delimited table with columns containing PMIDs. The names of the
# PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc.
#
# Output: Input table with additional columns containing the abstracts
# corresponding to the PMIDs from PubMed. The abstract columns are called
# "ABSTRACT_1", "ABSTRACT_2" etc.
#
# Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] [--install_packages]
#
# optional arguments:
#   -h, --help                  show help message
#   -i INPUT, --input INPUT     input file name. add path if file is not in
#                               working directory
#   -o OUTPUT, --output OUTPUT  output file name.
#                               [default "abstracts_by_pmids_output"]
#   --install_packages          auto install missing required packages

# Optional dependency bootstrap. This runs before argparse is loaded, so the
# flag is looked up directly in the raw command line arguments.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  for (pkg in c("argparse", "reutils", "easyPubMed", "textclean")) {
    # requireNamespace() only checks availability; the packages are attached
    # by the library() calls below.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = "http://cran.rstudio.com/")
    }
  }
}

suppressPackageStartupMessages(library("argparse"))
library("reutils")
suppressPackageStartupMessages(library("easyPubMed"))
suppressPackageStartupMessages(library("textclean"))

# Command line interface ----
parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")
args <- parser$parse_args()

if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify one.", call. = FALSE)
}

data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")

# Every column whose name contains "PMID" is treated as a PMID column.
pmids_cols_index <- grep("PMID", names(data))

# Remove the inline formatting tags <i>, <b>, <sub> and <exp> (opening and
# closing, case-insensitive) from a character vector.
strip_inline_tags <- function(text) {
  for (tag in c("i", "b", "sub", "exp")) {
    text <- gsub(paste0("</{0,1}", tag, ">"), "", text, ignore.case = TRUE)
  }
  text
}

# Fetch the abstracts for the given PMIDs and store them in one table row.
#
# Reads the global table `data` and returns a modified copy of it; the global
# itself is not changed, so the caller has to assign the result.
#
# Arguments:
#   PMIDs  character vector of PubMed IDs to request.
#   row    index of the row of `data` that receives the abstracts.
#
# Returns:
#   `data` with the columns "ABSTRACT_1", "ABSTRACT_2", ... of row `row` set
#   to the retrieved abstracts (NA for an article without AbstractText).
#   `data` is returned unchanged when the request is given up after 3 minutes
#   or when the response contains no articles.
fetch_abstracts <- function(PMIDs, row) {
  efetch_result <- NULL
  try_num <- 1
  t_0 <- Sys.time()

  # Retry until a usable response arrives; the wait grows with each attempt.
  while (is.null(efetch_result)) {
    if (try_num > 1) {
      Sys.sleep(time = 1 * try_num)
      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
    }

    # Timing check: kill at 3 min
    if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > 3) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    # A failed call is mapped to NULL so that the loop tries again.
    efetch_result <- tryCatch(
      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml")),
      error = function(e) NULL
    )

    # A "400 Bad Request" reported inside the result object is retried too.
    # isTRUE(any(...)) also covers a missing, empty or NA error field.
    fetch_error <- as.list(efetch_result$errors)$error
    if (isTRUE(any(fetch_error == "HTTP error: Status 400; Bad Request"))) {
      efetch_result <- NULL
    }

    try_num <- try_num + 1
  }

  # Split the response into one chunk per <PubmedArticle>; the first piece is
  # whatever precedes the first article and is dropped.
  xml_data <- strsplit(efetch_result$content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
  xml_data <- vapply(xml_data, function(record) {
    # Trim extra stuff at the end of the record.
    if (!grepl("</PubmedArticle>$", record)) {
      record <- sub("(^.*</PubmedArticle>).*$", "\\1", record)
    }
    # Rebuild the XML structure (the opening tag was consumed by the split)
    # and collapse runs of whitespace.
    record <- paste("<PubmedArticle>", record)
    gsub("[[:space:]]{2,}", " ", record)
  }, character(1), USE.NAMES = FALSE)

  # One list element per article. lapply() is used on purpose: sapply() would
  # simplify equally sized results to a matrix, and the sections of a
  # structured abstract would then be handled as separate abstracts.
  abstract_sections <- lapply(xml_data, function(record) {
    custom_grep(record, tag = "AbstractText", format = "char")
  })

  # Join the sections of each article into a single cleaned string.
  abstracts <- vapply(abstract_sections, function(sections) {
    if (length(sections) < 1) {
      return(NA_character_)
    }
    if (length(sections) > 1) {
      sections <- paste(sections, collapse = " ")
    }
    strip_inline_tags(sections)
  }, character(1), USE.NAMES = FALSE)

  if (length(abstracts) > 0) {
    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
  }

  data
}

# Main loop: one PubMed request per table row ----
for (row in seq_len(nrow(data))) {
  # Flatten the PMID cells of this row to a plain character vector first;
  # unique() on a one-row data.frame would compare rows, not values.
  row_values <- unlist(
    lapply(data[row, pmids_cols_index, drop = FALSE], as.character),
    use.names = FALSE
  )
  PMIDs <- unique(row_values)
  PMIDs <- PMIDs[!is.na(PMIDs) & PMIDs != "NA" & nzchar(PMIDs)]

  if (length(PMIDs) > 0) {
    data <- tryCatch(
      fetch_abstracts(PMIDs, row),
      error = function(e) {
        message("Could not retrieve abstracts for row ", row, ": ", conditionMessage(e))
        Sys.sleep(3)
        # Keep the table as it was; the handler's value replaces `data`.
        data
      }
    )
  } else {
    print(paste("No PMIDs in row", row))
  }
}

write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE)