comparison pubmed_by_queries.R @ 0:02e46a96e98a draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author iuc
date Wed, 24 Mar 2021 08:34:22 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:02e46a96e98a
1 #!/usr/bin/env Rscript
2 #tool: pubmed_by_queries
3 #
4 #This tool uses a set of search queries to download a defined number of abstracts or
5 #PMIDs for search query from PubMed. PubMed's search rules and syntax apply.
6 #
7 #Input: Tab-delimited table with search queries in a column starting with "ID_",
8 #e.g. "ID_gene" if search queries are genes.
9 #
10 #Output: Input table with additional columns
11 #with PMIDs or abstracts (--abstracts) from PubMed.
12 #
13 #Usage:
14 #$pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
15 #
16 #optional arguments:
17 # -h, --help show this help message and exit
18 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
19 # -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"]
20 # -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"]
21 # -a, --abstract if abstracts instead of PMIDs should be retrieved use --abstracts
22 # -k KEY, --key KEY if ncbi API key is available, add it to speed up the download of PubMed data.
23 # For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information).
24
25 if ("--install_packages" %in% commandArgs()) {
26 print("Installing packages")
27 if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/") ;
28 if (!require("easyPubMed")) install.packages("easyPubMed", repo = "http://cran.rstudio.com/") ;
29 }
30
31 suppressPackageStartupMessages(library("argparse"))
32 suppressPackageStartupMessages(library("easyPubMed"))
33
34 parser <- ArgumentParser()
35 parser$add_argument("-i", "--input",
36 help = "Input fie name. add path if file is not in working directory")
37 parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
38 help = "Output file name. [default \"%(default)s\"]")
39 parser$add_argument("-n", "--number", type = "integer", default = 5,
40 help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
41 parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
42 help = "If abstracts instead of PMIDs should be retrieved use --abstracts ")
43 parser$add_argument("-k", "--key", type = "character",
44 help = "If ncbi API key is available, add it to speed up the download of PubMed data. For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information).")
45 parser$add_argument("--install_packages", action = "store_true", default = FALSE,
46 help = "If you want to auto install missing required packages.")
47 args <- parser$parse_args()
48
49 if (!is.null(args$key)) {
50 if (file.exists(args$key)) {
51 credentials <- read.table(args$key, quote = "\"", comment.char = "")
52 args$key <- credentials[1, 1]
53 }
54 }
55
56 max_web_tries <- 100
57
58 data <- read.delim(args$input, stringsAsFactors = FALSE)
59
60 id_col_index <- grep("ID_", names(data))
61
62
63 fetch_pmids <- function(data, number, pubmed_search, query, row, max_web_tries) {
64 my_pubmed_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
65 "db=pubmed&retmax=", number,
66 "&term=", pubmed_search$OriginalQuery,
67 "&usehistory=n", sep = "")
68 # get ids
69 idxml <- c()
70 for (i in seq(max_web_tries)) {
71 tryCatch({
72 id_connect <- suppressWarnings(url(my_pubmed_url, open = "rb", encoding = "UTF8"))
73 idxml <- suppressWarnings(readLines(id_connect, warn = FALSE, encoding = "UTF8"))
74 suppressWarnings(close(id_connect))
75 break
76 }, error = function(e) {
77 print(paste("Error getting URL, sleeping", 2 * i, "seconds."))
78 print(e)
79 Sys.sleep(time = 2 * i)
80 })
81 }
82 pmids <- c()
83 for (i in seq(length(idxml))) {
84 if (grepl("^<Id>", idxml[i])) {
85 pmid <- custom_grep(idxml[i], tag = "Id", format = "char")
86 pmids <- c(pmids, as.character(pmid[1]))
87 }
88 }
89 if (length(pmids) > 0) {
90 data[row, sapply(seq(length(pmids)), function(i) {
91 paste0("PMID_", i)
92 })] <- pmids
93 cat(length(pmids), " PMIDs for ", query, " are added in the table.", "\n")
94 }
95 return(data)
96 }
97
98
99 fetch_abstracts <- function(data, number, query, pubmed_search) {
100 efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
101 "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
102 "&retstart=", 0, "&retmax=", number,
103 "&rettype=", "null", "&retmode=", "xml", sep = "")
104 api_key <- pubmed_search$APIkey
105 if (!is.null(api_key)) {
106 efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
107 }
108 # initialize
109 out_data <- NULL
110 try_num <- 1
111 t_0 <- Sys.time()
112 # Try to fetch results
113 while (is.null(out_data)) {
114 # Timing check: kill at 3 min
115 if (try_num > 1) {
116 Sys.sleep(time = 2 * try_num)
117 cat("Problem to receive PubMed data or error is received. Please wait. Try number:",
118 try_num, "\n")
119 }
120 t_1 <- Sys.time()
121 if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
122 message("Killing the request! Something is not working. Please, try again later",
123 "\n")
124 return(data)
125 }
126 # ENTREZ server connect
127 out_data <- tryCatch({
128 tmp_connect <- suppressWarnings(url(efetch_url,
129 open = "rb",
130 encoding = "UTF8"))
131 suppressWarnings(readLines(tmp_connect,
132 warn = FALSE,
133 encoding = "UTF8"))
134 }, error = function(e) {
135 print(e)
136 }, finally = {
137 try(suppressWarnings(close(tmp_connect)),
138 silent = TRUE)
139 })
140 # Check if error
141 if (!is.null(out_data) &&
142 class(out_data) == "character" &&
143 grepl("<ERROR>", substr(paste(utils::head(out_data, n = 100),
144 collapse = ""), 1, 250))) {
145 out_data <- NULL
146 }
147 try_num <- try_num + 1
148 }
149 if (is.null(out_data)) {
150 message("Killing the request! Something is not working. Please, try again later",
151 "\n")
152 return(data)
153 } else {
154 return(out_data)
155 }
156 }
157
158
159 process_xml_abstracts <- function(out_data) {
160 xml_data <- paste(out_data, collapse = "")
161 # articles to list
162 xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
163 xml_data <- sapply(xml_data, function(x) {
164 #trim extra stuff at the end of the record
165 if (!grepl("</PubmedArticle>$", x))
166 x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
167 # Rebuid XML structure and proceed
168 x <- paste("<PubmedArticle>", x)
169 gsub("[[:space:]]{2,}", " ", x)
170 },
171 USE.NAMES = FALSE, simplify = TRUE)
172 #titles
173 titles <- sapply(xml_data, function(x) {
174 x <- custom_grep(x, tag = "ArticleTitle", format = "char")
175 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
176 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
177 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
178 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
179 if (length(x) > 1) {
180 x <- paste(x, collapse = " ", sep = " ")
181 } else if (length(x) < 1) {
182 x <- NA
183 }
184 x
185 },
186 USE.NAMES = FALSE, simplify = TRUE)
187 # abstracts
188 abstract_text <- sapply(xml_data, function(x) {
189 custom_grep(x, tag = "AbstractText", format = "char")
190 },
191 USE.NAMES = FALSE, simplify = TRUE)
192 abstracts <- sapply(abstract_text, function(x) {
193 if (length(x) > 1) {
194 x <- paste(x, collapse = " ", sep = " ")
195 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
196 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
197 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
198 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
199 } else if (length(x) < 1) {
200 x <- NA
201 } else {
202 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
203 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
204 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
205 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
206 }
207 x
208 },
209 USE.NAMES = FALSE, simplify = TRUE)
210 #add title to abstracts
211 if (length(titles) == length(abstracts)) {
212 abstracts <- paste(titles, abstracts)
213 }
214 return(abstracts)
215 }
216
217
218 pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
219 if (is.null(query)) {
220 print(data)
221 }
222 pubmed_search <- get_pubmed_ids(query, api_key = key)
223 if (as.numeric(pubmed_search$Count) == 0) {
224 cat("No PubMed result for the following query: ", query, "\n")
225 return(data)
226 } else if (abstract == FALSE) { # fetch PMIDs
227 data <- fetch_pmids(data, number, pubmed_search, query, row, max_web_tries)
228 return(data)
229 } else if (abstract == TRUE) { # fetch abstracts and title text
230 out_data <- fetch_abstracts(data, number, query, pubmed_search)
231 abstracts <- process_xml_abstracts(out_data)
232 #add abstracts to data frame
233 if (length(abstracts) > 0) {
234 data[row, sapply(seq(length(abstracts)),
235 function(i) {
236 paste0("ABSTRACT_", i)
237 })] <- abstracts
238 cat(length(abstracts), " abstracts for ", query, " are added in the table.",
239 "\n")
240 }
241 return(data)
242 }
243 }
244
245 for (i in seq(nrow(data))) {
246 data <- tryCatch(pubmed_data_in_table(data = data,
247 row = i,
248 query = data[i, id_col_index],
249 number = args$number,
250 key = args$key,
251 abstract = args$abstract), error = function(e) {
252 print("main error")
253 print(e)
254 Sys.sleep(5)
255 })
256 }
257
258 write.table(data, args$output, append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)