pubmed_by_queries.R @ 0:0692d11af909 (draft, default, tip)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
| author | iuc |
|---|---|
| date | Wed, 24 Mar 2021 08:33:25 +0000 |
| parents | |
| children | |
comparison: -1:000000000000 → 0:0692d11af909 (file added)
```r
#!/usr/bin/env Rscript
# tool: pubmed_by_queries
#
# This tool uses a set of search queries to download a defined number of abstracts or
# PMIDs per search query from PubMed. PubMed's search rules and syntax apply.
#
# Input: tab-delimited table with search queries in a column whose name starts with
# "ID_", e.g. "ID_gene" if the search queries are genes.
#
# Output: the input table with additional columns
# containing PMIDs or abstracts (--abstract) from PubMed.
#
# Usage:
# $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
#
# optional arguments:
#  -h, --help                 show this help message and exit
#  -i INPUT, --input INPUT    input file name; add a path if the file is not in the working directory
#  -o OUTPUT, --output OUTPUT output file name [default "pubmed_by_queries_output"]
#  -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"]
#  -a, --abstract             retrieve abstracts instead of PMIDs
#  -k KEY, --key KEY          if an NCBI API key is available, add it to speed up the download
#                             of PubMed data. For usage in Galaxy, add the API key to the Galaxy
#                             user preferences (User / Preferences / Manage Information).
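#
# Example invocation (illustrative; the file names are hypothetical):
# $ Rscript pubmed_by_queries.R -i genes.tab -o genes_pubmed.tab -n 10 -a
# A minimal input table for this call could look like (one query column, tab-delimited):
#   ID_gene
#   BRCA1
#   TP53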

if ("--install_packages" %in% commandArgs()) {
    print("Installing packages")
    if (!require("argparse")) install.packages("argparse", repos = "http://cran.rstudio.com/")
    if (!require("easyPubMed")) install.packages("easyPubMed", repos = "http://cran.rstudio.com/")
}

suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "Input file name. Add a path if the file is not in the working directory.")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "Output file name. [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "Retrieve abstracts instead of PMIDs.")
parser$add_argument("-k", "--key", type = "character",
                    help = "If an NCBI API key is available, add it to speed up the download of PubMed data. For usage in Galaxy, add the API key to the Galaxy user preferences (User / Preferences / Manage Information).")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "Auto-install missing required packages.")
args <- parser$parse_args()

if (!is.null(args$key)) {
    if (file.exists(args$key)) {
        credentials <- read.table(args$key, quote = "\"", comment.char = "")
        args$key <- credentials[1, 1]
    }
}
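# Note on the key file (format as implied by the code above): when --key points to
# an existing file, the first whitespace-delimited token in that file is used as the
# API key, so a file containing just the key on a single line is sufficient.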

max_web_tries <- 100

data <- read.delim(args$input, stringsAsFactors = FALSE)

id_col_index <- grep("ID_", names(data))


fetch_pmids <- function(data, number, pubmed_search, query, row, max_web_tries) {
    my_pubmed_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                           "db=pubmed&retmax=", number,
                           "&term=", pubmed_search$OriginalQuery,
                           "&usehistory=n", sep = "")
    # get ids
    idxml <- c()
    for (i in seq(max_web_tries)) {
        tryCatch({
            id_connect <- suppressWarnings(url(my_pubmed_url, open = "rb", encoding = "UTF8"))
            idxml <- suppressWarnings(readLines(id_connect, warn = FALSE, encoding = "UTF8"))
            suppressWarnings(close(id_connect))
            break
        }, error = function(e) {
            print(paste("Error getting URL, sleeping", 2 * i, "seconds."))
            print(e)
            Sys.sleep(time = 2 * i)
        })
    }
    pmids <- c()
    for (i in seq_along(idxml)) {
        if (grepl("^<Id>", idxml[i])) {
            pmid <- custom_grep(idxml[i], tag = "Id", format = "char")
            pmids <- c(pmids, as.character(pmid[1]))
        }
    }
    if (length(pmids) > 0) {
        data[row, sapply(seq_along(pmids), function(i) {
            paste0("PMID_", i)
        })] <- pmids
        cat(length(pmids), " PMIDs for ", query, " are added to the table.", "\n")
    }
    return(data)
}


fetch_abstracts <- function(data, number, query, pubmed_search) {
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null", "&retmode=", "xml", sep = "")
    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
        efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }
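    # The EFetch call reuses the history-server entry created by get_pubmed_ids
    # (WebEnv/query_key), so the query itself does not need to be repeated in this URL.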
    # initialize
    out_data <- NULL
    try_num <- 1
    t_0 <- Sys.time()
    # try to fetch results
    while (is.null(out_data)) {
        # timing check: kill the request after 3 minutes
        if (try_num > 1) {
            Sys.sleep(time = 2 * try_num)
            cat("Problem receiving PubMed data or an error was returned. Please wait. Try number:",
                try_num, "\n")
        }
        t_1 <- Sys.time()
        if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
            message("Killing the request! Something is not working. Please try again later.",
                    "\n")
            return(data)
        }
        # connect to the ENTREZ server
        out_data <- tryCatch({
            tmp_connect <- suppressWarnings(url(efetch_url,
                                                open = "rb",
                                                encoding = "UTF8"))
            suppressWarnings(readLines(tmp_connect,
                                       warn = FALSE,
                                       encoding = "UTF8"))
        }, error = function(e) {
            print(e)
            NULL # return NULL so the loop retries instead of keeping the condition object
        }, finally = {
            try(suppressWarnings(close(tmp_connect)),
                silent = TRUE)
        })
        # check for an <ERROR> tag in the response
        if (!is.null(out_data) &&
            is.character(out_data) &&
            grepl("<ERROR>", substr(paste(utils::head(out_data, n = 100),
                                          collapse = ""), 1, 250))) {
            out_data <- NULL
        }
        try_num <- try_num + 1
    }
    if (is.null(out_data)) {
        message("Killing the request! Something is not working. Please try again later.",
                "\n")
        return(data)
    } else {
        return(out_data)
    }
}


process_xml_abstracts <- function(out_data) {
    # remove simple inline markup (<i>, <b>, <sub>, <exp>) from PubMed text
    strip_tags <- function(x) {
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
    }
    xml_data <- paste(out_data, collapse = "")
    # split the response into one string per article
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
        # trim extra content at the end of the record
        if (!grepl("</PubmedArticle>$", x))
            x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
        # rebuild the XML structure and proceed
        x <- paste("<PubmedArticle>", x)
        gsub("[[:space:]]{2,}", " ", x)
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # titles
    titles <- sapply(xml_data, function(x) {
        x <- custom_grep(x, tag = "ArticleTitle", format = "char")
        x <- strip_tags(x)
        if (length(x) > 1) {
            x <- paste(x, collapse = " ", sep = " ")
        } else if (length(x) < 1) {
            x <- NA
        }
        x
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # abstracts
    abstract_text <- sapply(xml_data, function(x) {
        custom_grep(x, tag = "AbstractText", format = "char")
    },
    USE.NAMES = FALSE, simplify = TRUE)
    abstracts <- sapply(abstract_text, function(x) {
        if (length(x) < 1) {
            x <- NA
        } else {
            x <- strip_tags(paste(x, collapse = " ", sep = " "))
        }
        x
    },
    USE.NAMES = FALSE, simplify = TRUE)
    # prepend the title to each abstract
    if (length(titles) == length(abstracts)) {
        abstracts <- paste(titles, abstracts)
    }
    return(abstracts)
}


pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
    if (is.null(query)) {
        # no query in this row; return the table unchanged
        return(data)
    }
    pubmed_search <- get_pubmed_ids(query, api_key = key)
    if (as.numeric(pubmed_search$Count) == 0) {
        cat("No PubMed result for the following query: ", query, "\n")
        return(data)
    } else if (!abstract) { # fetch PMIDs
        data <- fetch_pmids(data, number, pubmed_search, query, row, max_web_tries)
        return(data)
    } else { # fetch abstracts and title text
        out_data <- fetch_abstracts(data, number, query, pubmed_search)
        abstracts <- process_xml_abstracts(out_data)
        # add the abstracts to the data frame
        if (length(abstracts) > 0) {
            data[row, sapply(seq_along(abstracts),
                             function(i) {
                                 paste0("ABSTRACT_", i)
                             })] <- abstracts
            cat(length(abstracts), " abstracts for ", query, " are added to the table.",
                "\n")
        }
        return(data)
    }
}

for (i in seq_len(nrow(data))) {
    data <- tryCatch(pubmed_data_in_table(data = data,
                                          row = i,
                                          query = data[i, id_col_index],
                                          number = args$number,
                                          key = args$key,
                                          abstract = args$abstract), error = function(e) {
        print("main error")
        print(e)
        Sys.sleep(5)
        data # keep the current table if this row fails, instead of overwriting it with NULL
    })
}

write.table(data, args$output, append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
```
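
For a quick sanity check of the result, the tab-delimited output can be read back into R. This is a minimal sketch assuming the default output name and PMID mode (both are assumptions, not part of the tool itself):

```r
# Minimal sketch (assumes the default output name and PMID mode):
# the output keeps the original columns and gains PMID_1 ... PMID_n per row.
result <- read.delim("pubmed_by_queries_output", stringsAsFactors = FALSE)
head(result[, grep("^PMID_", names(result))])
```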
