Mercurial > repos > iuc > text_to_wordmatrix
comparison pubmed_by_queries.R @ 0:0692d11af909 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author | iuc |
---|---|
date | Wed, 24 Mar 2021 08:33:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0692d11af909 |
---|---|
1 #!/usr/bin/env Rscript | |
2 #tool: pubmed_by_queries | |
3 # | |
4 #This tool uses a set of search queries to download a defined number of abstracts or | |
5 #PMIDs for search query from PubMed. PubMed's search rules and syntax apply. | |
6 # | |
7 #Input: Tab-delimited table with search queries in a column starting with "ID_", | |
8 #e.g. "ID_gene" if search queries are genes. | |
9 # | |
10 #Output: Input table with additional columns | |
11 #with PMIDs or abstracts (--abstracts) from PubMed. | |
12 # | |
13 #Usage: | |
14 #$pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY] | |
15 # | |
16 #optional arguments: | |
17 # -h, --help show this help message and exit | |
18 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
19 # -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"] | |
20 # -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"] | |
21 # -a, --abstract if abstracts instead of PMIDs should be retrieved use --abstracts | |
22 # -k KEY, --key KEY if ncbi API key is available, add it to speed up the download of PubMed data. | |
23 # For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information). | |
24 | |
25 if ("--install_packages" %in% commandArgs()) { | |
26 print("Installing packages") | |
27 if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/") ; | |
28 if (!require("easyPubMed")) install.packages("easyPubMed", repo = "http://cran.rstudio.com/") ; | |
29 } | |
30 | |
31 suppressPackageStartupMessages(library("argparse")) | |
32 suppressPackageStartupMessages(library("easyPubMed")) | |
33 | |
34 parser <- ArgumentParser() | |
35 parser$add_argument("-i", "--input", | |
36 help = "Input fie name. add path if file is not in working directory") | |
37 parser$add_argument("-o", "--output", default = "pubmed_by_queries_output", | |
38 help = "Output file name. [default \"%(default)s\"]") | |
39 parser$add_argument("-n", "--number", type = "integer", default = 5, | |
40 help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]") | |
41 parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE, | |
42 help = "If abstracts instead of PMIDs should be retrieved use --abstracts ") | |
43 parser$add_argument("-k", "--key", type = "character", | |
44 help = "If ncbi API key is available, add it to speed up the download of PubMed data. For usage in Galaxy add the API key to the Galaxy user-preferences (User/ Preferences/ Manage Information).") | |
45 parser$add_argument("--install_packages", action = "store_true", default = FALSE, | |
46 help = "If you want to auto install missing required packages.") | |
47 args <- parser$parse_args() | |
48 | |
49 if (!is.null(args$key)) { | |
50 if (file.exists(args$key)) { | |
51 credentials <- read.table(args$key, quote = "\"", comment.char = "") | |
52 args$key <- credentials[1, 1] | |
53 } | |
54 } | |
55 | |
56 max_web_tries <- 100 | |
57 | |
58 data <- read.delim(args$input, stringsAsFactors = FALSE) | |
59 | |
60 id_col_index <- grep("ID_", names(data)) | |
61 | |
62 | |
63 fetch_pmids <- function(data, number, pubmed_search, query, row, max_web_tries) { | |
64 my_pubmed_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", | |
65 "db=pubmed&retmax=", number, | |
66 "&term=", pubmed_search$OriginalQuery, | |
67 "&usehistory=n", sep = "") | |
68 # get ids | |
69 idxml <- c() | |
70 for (i in seq(max_web_tries)) { | |
71 tryCatch({ | |
72 id_connect <- suppressWarnings(url(my_pubmed_url, open = "rb", encoding = "UTF8")) | |
73 idxml <- suppressWarnings(readLines(id_connect, warn = FALSE, encoding = "UTF8")) | |
74 suppressWarnings(close(id_connect)) | |
75 break | |
76 }, error = function(e) { | |
77 print(paste("Error getting URL, sleeping", 2 * i, "seconds.")) | |
78 print(e) | |
79 Sys.sleep(time = 2 * i) | |
80 }) | |
81 } | |
82 pmids <- c() | |
83 for (i in seq(length(idxml))) { | |
84 if (grepl("^<Id>", idxml[i])) { | |
85 pmid <- custom_grep(idxml[i], tag = "Id", format = "char") | |
86 pmids <- c(pmids, as.character(pmid[1])) | |
87 } | |
88 } | |
89 if (length(pmids) > 0) { | |
90 data[row, sapply(seq(length(pmids)), function(i) { | |
91 paste0("PMID_", i) | |
92 })] <- pmids | |
93 cat(length(pmids), " PMIDs for ", query, " are added in the table.", "\n") | |
94 } | |
95 return(data) | |
96 } | |
97 | |
98 | |
99 fetch_abstracts <- function(data, number, query, pubmed_search) { | |
100 efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", | |
101 "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey, | |
102 "&retstart=", 0, "&retmax=", number, | |
103 "&rettype=", "null", "&retmode=", "xml", sep = "") | |
104 api_key <- pubmed_search$APIkey | |
105 if (!is.null(api_key)) { | |
106 efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "") | |
107 } | |
108 # initialize | |
109 out_data <- NULL | |
110 try_num <- 1 | |
111 t_0 <- Sys.time() | |
112 # Try to fetch results | |
113 while (is.null(out_data)) { | |
114 # Timing check: kill at 3 min | |
115 if (try_num > 1) { | |
116 Sys.sleep(time = 2 * try_num) | |
117 cat("Problem to receive PubMed data or error is received. Please wait. Try number:", | |
118 try_num, "\n") | |
119 } | |
120 t_1 <- Sys.time() | |
121 if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) { | |
122 message("Killing the request! Something is not working. Please, try again later", | |
123 "\n") | |
124 return(data) | |
125 } | |
126 # ENTREZ server connect | |
127 out_data <- tryCatch({ | |
128 tmp_connect <- suppressWarnings(url(efetch_url, | |
129 open = "rb", | |
130 encoding = "UTF8")) | |
131 suppressWarnings(readLines(tmp_connect, | |
132 warn = FALSE, | |
133 encoding = "UTF8")) | |
134 }, error = function(e) { | |
135 print(e) | |
136 }, finally = { | |
137 try(suppressWarnings(close(tmp_connect)), | |
138 silent = TRUE) | |
139 }) | |
140 # Check if error | |
141 if (!is.null(out_data) && | |
142 class(out_data) == "character" && | |
143 grepl("<ERROR>", substr(paste(utils::head(out_data, n = 100), | |
144 collapse = ""), 1, 250))) { | |
145 out_data <- NULL | |
146 } | |
147 try_num <- try_num + 1 | |
148 } | |
149 if (is.null(out_data)) { | |
150 message("Killing the request! Something is not working. Please, try again later", | |
151 "\n") | |
152 return(data) | |
153 } else { | |
154 return(out_data) | |
155 } | |
156 } | |
157 | |
158 | |
159 process_xml_abstracts <- function(out_data) { | |
160 xml_data <- paste(out_data, collapse = "") | |
161 # articles to list | |
162 xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1] | |
163 xml_data <- sapply(xml_data, function(x) { | |
164 #trim extra stuff at the end of the record | |
165 if (!grepl("</PubmedArticle>$", x)) | |
166 x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) | |
167 # Rebuid XML structure and proceed | |
168 x <- paste("<PubmedArticle>", x) | |
169 gsub("[[:space:]]{2,}", " ", x) | |
170 }, | |
171 USE.NAMES = FALSE, simplify = TRUE) | |
172 #titles | |
173 titles <- sapply(xml_data, function(x) { | |
174 x <- custom_grep(x, tag = "ArticleTitle", format = "char") | |
175 x <- gsub("</{0,1}i>", "", x, ignore.case = T) | |
176 x <- gsub("</{0,1}b>", "", x, ignore.case = T) | |
177 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) | |
178 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) | |
179 if (length(x) > 1) { | |
180 x <- paste(x, collapse = " ", sep = " ") | |
181 } else if (length(x) < 1) { | |
182 x <- NA | |
183 } | |
184 x | |
185 }, | |
186 USE.NAMES = FALSE, simplify = TRUE) | |
187 # abstracts | |
188 abstract_text <- sapply(xml_data, function(x) { | |
189 custom_grep(x, tag = "AbstractText", format = "char") | |
190 }, | |
191 USE.NAMES = FALSE, simplify = TRUE) | |
192 abstracts <- sapply(abstract_text, function(x) { | |
193 if (length(x) > 1) { | |
194 x <- paste(x, collapse = " ", sep = " ") | |
195 x <- gsub("</{0,1}i>", "", x, ignore.case = T) | |
196 x <- gsub("</{0,1}b>", "", x, ignore.case = T) | |
197 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) | |
198 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) | |
199 } else if (length(x) < 1) { | |
200 x <- NA | |
201 } else { | |
202 x <- gsub("</{0,1}i>", "", x, ignore.case = T) | |
203 x <- gsub("</{0,1}b>", "", x, ignore.case = T) | |
204 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) | |
205 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) | |
206 } | |
207 x | |
208 }, | |
209 USE.NAMES = FALSE, simplify = TRUE) | |
210 #add title to abstracts | |
211 if (length(titles) == length(abstracts)) { | |
212 abstracts <- paste(titles, abstracts) | |
213 } | |
214 return(abstracts) | |
215 } | |
216 | |
217 | |
218 pubmed_data_in_table <- function(data, row, query, number, key, abstract) { | |
219 if (is.null(query)) { | |
220 print(data) | |
221 } | |
222 pubmed_search <- get_pubmed_ids(query, api_key = key) | |
223 if (as.numeric(pubmed_search$Count) == 0) { | |
224 cat("No PubMed result for the following query: ", query, "\n") | |
225 return(data) | |
226 } else if (abstract == FALSE) { # fetch PMIDs | |
227 data <- fetch_pmids(data, number, pubmed_search, query, row, max_web_tries) | |
228 return(data) | |
229 } else if (abstract == TRUE) { # fetch abstracts and title text | |
230 out_data <- fetch_abstracts(data, number, query, pubmed_search) | |
231 abstracts <- process_xml_abstracts(out_data) | |
232 #add abstracts to data frame | |
233 if (length(abstracts) > 0) { | |
234 data[row, sapply(seq(length(abstracts)), | |
235 function(i) { | |
236 paste0("ABSTRACT_", i) | |
237 })] <- abstracts | |
238 cat(length(abstracts), " abstracts for ", query, " are added in the table.", | |
239 "\n") | |
240 } | |
241 return(data) | |
242 } | |
243 } | |
244 | |
245 for (i in seq(nrow(data))) { | |
246 data <- tryCatch(pubmed_data_in_table(data = data, | |
247 row = i, | |
248 query = data[i, id_col_index], | |
249 number = args$number, | |
250 key = args$key, | |
251 abstract = args$abstract), error = function(e) { | |
252 print("main error") | |
253 print(e) | |
254 Sys.sleep(5) | |
255 }) | |
256 } | |
257 | |
258 write.table(data, args$output, append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) |