comparison pubmed_by_queries.R @ 0:3f4adc85ba5d draft

"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author dlalgroup
date Thu, 24 Sep 2020 02:01:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3f4adc85ba5d
1 #!/usr/bin/env Rscript
2 #tool: pubmed_by_queries
3 #
4 #This tool uses a set of search queries to download a defined number of abstracts or PMIDs per search query from PubMed.
5 #PubMed's search rules and syntax apply.
6 #
7 #Input: Tab-delimited table with search queries in a column starting with "ID_", e.g. "ID_gene" if search queries are genes.
8 #
9 # Output: Input table with additional columns with PMIDs or abstracts (--abstracts) from PubMed.
10 #
11 #Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
12 #
13 # optional arguments:
14 # -h, --help show this help message and exit
15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
16 # -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"]
17 # -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"]
18 # -a, --abstract if abstracts instead of PMIDs should be retrieved use --abstract
19 # -k KEY, --key KEY if NCBI API key is available, add it to speed up the fetching of pubmed data
20
# Optional self-installation of the two required packages, triggered by the
# --install_packages flag (checked against the raw command line because the
# argparse parser itself depends on these packages being present).
# FIX: install.packages()'s argument is `repos` (plural); the previous
# `repo =` only worked through partial argument matching. Also use HTTPS
# for the CRAN mirror.
if ('--install_packages' %in% commandArgs()) {
  print('Installing packages')
  if (!require('argparse')) install.packages('argparse', repos = "https://cran.rstudio.com/")
  if (!require('easyPubMed')) install.packages('easyPubMed', repos = "https://cran.rstudio.com/")
}
26
# Attach required packages quietly, then define the command-line interface.
suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
# FIX: "input fie name" typo corrected in the help text.
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
# FIX: the help text told users to pass "--abstracts", but the flag defined
# here is "--abstract".
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "if abstracts instead of PMIDs should be retrieved use --abstract")
parser$add_argument("-k", "--key", type = "character",
                    help = "if NCBI API key is available, add it to speed up the download of pubmed data")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")
args <- parser$parse_args()
44
# Maximum number of attempts when contacting the NCBI E-utilities server.
MAX_WEB_TRIES <- 100

# Input: tab-delimited table; the search queries live in a column whose name
# starts with "ID_" (e.g. "ID_gene"), per the tool's documented contract.
data <- read.delim(args$input, stringsAsFactors = FALSE)

# FIX: anchor the pattern so only columns *starting* with "ID_" match.
# The previous unanchored grep("ID_", ...) would also pick up columns that
# merely contain "ID_" somewhere in their name.
id_col_index <- grep("^ID_", names(data))
50
# Fetch PubMed results for a single search query and write them into row
# `row` of `data`.
#
# Arguments:
#   data     - the full input data.frame; new PMID_<i> or ABSTRACT_<i>
#              columns are added to it as needed.
#   row      - index of the row currently being processed.
#   query    - PubMed search term taken from the ID_ column.
#   number   - maximum number of PMIDs/abstracts to keep (esearch retmax).
#   key      - NCBI API key, or NULL when none was supplied.
#   abstract - FALSE: store PMIDs; TRUE: store title + abstract text.
#
# Returns the (possibly widened) data.frame. When the query has no hits, or
# when the efetch step times out (~3 min), `data` is returned unchanged.
51 pubmed_data_in_table <- function(data, row, query, number, key, abstract){
# Debug aid: a NULL query means the ID_ column lookup upstream went wrong.
52 if (is.null(query)){print(data)}
# easyPubMed esearch: returns Count, WebEnv, QueryKey, OriginalQuery, APIkey.
53 pubmed_search <- get_pubmed_ids(query, api_key = key)
54
# No hits at all: report and leave the table untouched for this row.
55 if(as.numeric(pubmed_search$Count) == 0){
56 cat("No PubMed result for the following query: ", query, "\n")
57 return(data)
58
59 } else if (abstract == FALSE) { # fetch PMIDs
60
# Build the esearch URL directly (usehistory=n, so IDs come back inline).
61 myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
62 "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "")
63 # get ids
# Retry loop with linearly growing backoff (2, 4, 6, ... seconds).
# `idXML` stays empty (c()) if all MAX_WEB_TRIES attempts fail.
64 idXML <- c()
65 for (i in 1:MAX_WEB_TRIES){
66 tryCatch({
67 IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
68 idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
69 suppressWarnings(close(IDconnect))
70 break
71 }, error = function(e) {
72 print(paste('Error getting URL, sleeping',2*i,'seconds.'))
73 print(e)
74 Sys.sleep(time = 2*i)
75 })
76 }
77
# Collect one PMID per <Id> line of the esearch response.
# NOTE(review): 1:length(idXML) iterates c(1, 0) when idXML is empty;
# harmless here only because grepl() on the resulting NA yields FALSE —
# seq_along() would be the safe idiom.
78 PMIDs = c()
79
80 for (i in 1:length(idXML)) {
81 if (grepl("^<Id>", idXML[i])) {
# custom_grep() is easyPubMed's tag extractor (format = "char" -> character).
82 pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
83 PMIDs <- c(PMIDs, as.character(pmid[1]))
84 }
85 }
86
87
# Widen the table with PMID_1..PMID_k columns for this row.
88 if(length(PMIDs)>0){
89 data[row,sapply(1:length(PMIDs),function(i){paste0("PMID_",i)})] <- PMIDs
90 cat(length(PMIDs)," PMIDs for ",query, " are added in the table.", "\n")
91 }
92
93 return(data)
94
95 } else if (abstract == TRUE) { # fetch abstracts and title text
96
# efetch via the history server (WebEnv/QueryKey from the esearch step).
97 efetch_url = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
98 "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
99 "&retstart=", 0, "&retmax=", number,
100 "&rettype=", "null","&retmode=", "xml", sep = "")
101
# Append the API key (raises the NCBI rate limit) when one was supplied.
102 api_key <- pubmed_search$APIkey
103 if (!is.null(api_key)) {
104 efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
105 }
106
107 # initialize
108 out.data <- NULL
109 try_num <- 1
110 t_0 <- Sys.time()
111
# Retry until we get a response, backing off 2*try_num seconds per retry
# and giving up after ~3 minutes of wall-clock time.
112 # Try to fetch results
113 while(is.null(out.data)) {
114
115 # Timing check: kill at 3 min
116 if (try_num > 1){
117 Sys.sleep(time = 2*try_num)
118 cat("Problem to receive PubMed data or error is received. Please wait. Try number:",try_num,"\n")
119 }
120
121 t_1 <- Sys.time()
122
123 if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){
124 message("Killing the request! Something is not working. Please, try again later","\n")
125 return(data)
126 }
127
128 # ENTREZ server connect
# NOTE(review): on error this handler ends with print(e), which returns
# the condition object invisibly — so out.data becomes the condition,
# not NULL. The class() check below then evaluates a length-3 logical
# inside &&; on R >= 4.3 that is an error. Returning NULL explicitly
# from the handler would be the robust form — confirm before changing.
129 out.data <- tryCatch({
130 tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
131 suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
132 }, error = function(e) {
133 print(e)
134 }, finally = {
135 try(suppressWarnings(close(tmpConnect)), silent = TRUE)
136 })
137
138 # Check if error
# A response containing <ERROR> near the top is treated as a failed try.
139 if (!is.null(out.data) &&
140 class(out.data) == "character" &&
141 grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
142 out.data <- NULL
143 }
144 try_num <- try_num + 1
145 }
146
# Defensive re-check; the while loop above only exits with non-NULL out.data
# or via the timeout return, so this branch is effectively unreachable.
147 if (is.null(out.data)) {
148 message("Killing the request! Something is not working. Please, try again later","\n")
149 return(data)
150 }
151
152 # process xml data
153 xml_data <- paste(out.data, collapse = "")
154
# Split the response into one string per <PubmedArticle> record and
# normalize whitespace; the opening tag is re-attached so each element is
# again well-formed for custom_grep().
155 # articles to list
156 xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
157 xml_data <- sapply(xml_data, function(x) {
158 #trim extra stuff at the end of the record
159 if (!grepl("</PubmedArticle>$", x))
160 x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
161 # Rebuid XML structure and proceed
162 x <- paste("<PubmedArticle>", x)
163 gsub("[[:space:]]{2,}", " ", x)},
164 USE.NAMES = FALSE, simplify = TRUE)
165
# Extract article titles; strip inline markup (<i>, <b>, <sub>, <exp>),
# join multi-part titles, and use NA when a record has no title.
166 #titles
167 titles = sapply(xml_data, function(x){
168 x = custom_grep(x, tag="ArticleTitle", format="char")
169 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
170 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
171 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
172 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
173 if (length(x) > 1){
174 x <- paste(x, collapse = " ", sep = " ")
175 } else if (length(x) < 1) {
176 x <- NA
177 }
178 x
179 },
180 USE.NAMES = FALSE, simplify = TRUE)
181
182 # abstracts
183 abstract.text = sapply(xml_data, function(x){
184 custom_grep(x, tag="AbstractText", format="char")},
185 USE.NAMES = FALSE, simplify = TRUE)
186
# Same clean-up for abstracts; structured abstracts (multiple
# <AbstractText> sections) are concatenated into one string.
187 abstracts <- sapply(abstract.text, function(x){
188 if (length(x) > 1){
189 x <- paste(x, collapse = " ", sep = " ")
190 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
191 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
192 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
193 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
194 } else if (length(x) < 1) {
195 x <- NA
196 } else {
197 x <- gsub("</{0,1}i>", "", x, ignore.case = T)
198 x <- gsub("</{0,1}b>", "", x, ignore.case = T)
199 x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
200 x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
201 }
202 x
203 },
204 USE.NAMES = FALSE, simplify = TRUE)
205
# Prepend each title to its abstract (only when the two vectors line up).
206 #add title to abstracts
207 if (length(titles) == length(abstracts)){
208 abstracts = paste(titles, abstracts)
209 }
210
# Widen the table with ABSTRACT_1..ABSTRACT_k columns for this row.
211 #add abstracts to data frame
212 if(length(abstracts)>0){
213 data[row,sapply(1:length(abstracts),function(i){paste0("ABSTRACT_",i)})] <- abstracts
214 cat(length(abstracts)," abstracts for ",query, " are added in the table.", "\n")
215 }
216
217 return(data)
218 }
219 }
220
# Query PubMed once per input row. Failures for one row must not destroy the
# results accumulated so far, so the error handler returns the current table.
for (i in seq_len(nrow(data))) {
  data <- tryCatch(
    pubmed_data_in_table(data     = data,
                         row      = i,
                         query    = data[i, id_col_index],
                         number   = args$number,
                         key      = args$key,
                         abstract = args$abstract),
    error = function(e) {
      print('main error')
      print(e)
      Sys.sleep(5)
      # FIX: the handler previously ended with Sys.sleep(5), whose NULL
      # return value was assigned to `data`, silently wiping the whole
      # table (including all results from earlier rows) on any error.
      data
    })
}
233
234 write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
235
236