Mercurial > repos > dlalgroup > pmids_to_pubtator_matrix
comparison pubmed_by_queries.R @ 0:3f4adc85ba5d draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:01:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3f4adc85ba5d |
---|---|
1 #!/usr/bin/env Rscript | |
2 #tool: pubmed_by_queries | |
3 # | |
#This tool uses a set of search queries to download a defined number of abstracts or PMIDs per search query from PubMed.
5 #PubMed's search rules and syntax apply. | |
6 # | |
7 #Input: Tab-delimited table with search queries in a column starting with "ID_", e.g. "ID_gene" if search queries are genes. | |
8 # | |
9 # Output: Input table with additional columns with PMIDs or abstracts (--abstracts) from PubMed. | |
10 # | |
11 #Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY] | |
12 # | |
13 # optional arguments: | |
14 # -h, --help show this help message and exit | |
15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
16 # -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"] | |
17 # -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"] | |
# -a, --abstract           if abstracts instead of PMIDs should be retrieved use --abstract
19 # -k KEY, --key KEY if NCBI API key is available, add it to speed up the fetching of pubmed data | |
20 | |
# Optionally install the two required packages when the script is invoked
# with --install_packages (checked on raw commandArgs() because argparse
# itself may not be installed yet).  `require()` is intentional here: its
# FALSE return signals a missing package without aborting the script.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  # Spell out `repos` in full: the original `repo=` only worked through
  # R's partial argument matching.
  if (!require("argparse")) install.packages("argparse", repos = "http://cran.rstudio.com/")
  if (!require("easyPubMed")) install.packages("easyPubMed", repos = "http://cran.rstudio.com/")
}
26 | |
suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

# Command-line interface ------------------------------------------------
parser <- ArgumentParser()
# Fixed typo in user-facing help text ("fie" -> "file").
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]")
# Help text corrected: the declared flag is --abstract, not --abstracts.
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "if abstracts instead of PMIDs should be retrieved use --abstract")
parser$add_argument("-k", "--key", type = "character",
                    help = "if ncbi API key is available, add it to speed up the download of pubmed data")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")
args <- parser$parse_args()
44 | |
# Maximum number of retries when contacting the NCBI eutils endpoints.
MAX_WEB_TRIES <- 100

# Read the tab-delimited query table; search queries live in the column
# whose name contains "ID_" (per the tool's documented convention).
data <- read.delim(args$input, stringsAsFactors = FALSE)

id_col_index <- grep("ID_", names(data))
# Fail fast with a clear message instead of silently mis-indexing later.
if (length(id_col_index) == 0) {
  stop("No column with a name starting with 'ID_' found in the input table.",
       call. = FALSE)
}
50 | |
# Fetch PubMed PMIDs or abstracts for one search query and store them as
# extra columns (PMID_i / ABSTRACT_i) of row `row` in `data`.
#
# Args:
#   data     - data.frame holding the query table; a modified copy is returned.
#   row      - row index whose query is being processed.
#   query    - PubMed search term (PubMed search syntax applies).
#   number   - maximum number of PMIDs/abstracts to fetch.
#   key      - optional NCBI API key (NULL when none was supplied).
#   abstract - TRUE -> fetch title + abstract text, FALSE -> fetch PMIDs.
#
# Returns: `data`, unchanged when the query yields no hits or the request
#   ultimately fails, otherwise with additional columns filled for `row`.
pubmed_data_in_table <- function(data, row, query, number, key, abstract){

  # Remove the simple inline markup (<i>, <b>, <sub>, <exp>) that PubMed
  # embeds in titles and abstracts.  Factored out of four duplicated
  # gsub runs in the original.
  strip_tags <- function(x) {
    x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
    x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
    x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
    gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
  }

  if (is.null(query)){print(data)}
  pubmed_search <- get_pubmed_ids(query, api_key = key)

  if(as.numeric(pubmed_search$Count) == 0){
    cat("No PubMed result for the following query: ", query, "\n")
    return(data)

  } else if (abstract == FALSE) { # fetch PMIDs

    myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                         "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "")

    # Fetch the esearch XML, retrying with a linearly growing back-off.
    idXML <- c()
    for (i in seq_len(MAX_WEB_TRIES)){
      tryCatch({
        IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
        idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
        suppressWarnings(close(IDconnect))
        break
      }, error = function(e) {
        print(paste('Error getting URL, sleeping',2*i,'seconds.'))
        print(e)
        Sys.sleep(time = 2*i)
      })
    }

    # Collect every <Id> element.  seq_along() (instead of the original
    # 1:length(idXML)) is safe when idXML stayed empty after all retries;
    # 1:0 would iterate and error on `if (logical(0))`.
    PMIDs <- c()
    for (i in seq_along(idXML)) {
      if (grepl("^<Id>", idXML[i])) {
        pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
        PMIDs <- c(PMIDs, as.character(pmid[1]))
      }
    }

    # Write PMIDs into PMID_1..PMID_n columns of this row.
    if(length(PMIDs) > 0){
      data[row, paste0("PMID_", seq_along(PMIDs))] <- PMIDs
      cat(length(PMIDs)," PMIDs for ",query, " are added in the table.", "\n")
    }

    return(data)

  } else if (abstract == TRUE) { # fetch abstracts and title text

    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null","&retmode=", "xml", sep = "")

    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }

    # Retry the efetch request until data arrives or ~3 minutes elapse.
    out.data <- NULL
    try_num <- 1
    t_0 <- Sys.time()

    while(is.null(out.data)) {

      # Back off increasingly between retries.
      if (try_num > 1){
        Sys.sleep(time = 2*try_num)
        cat("Problem to receive PubMed data or error is received. Please wait. Try number:",try_num,"\n")
      }

      t_1 <- Sys.time()

      # Hard kill after 3 minutes so one bad query cannot hang the run.
      if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){
        message("Killing the request! Something is not working. Please, try again later","\n")
        return(data)
      }

      # ENTREZ server connect; on error the handler returns NULL and the
      # while loop retries.  `finally` closes the connection either way.
      out.data <- tryCatch({
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        print(e)
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })

      # An <ERROR> tag near the start of the payload means a server-side
      # failure: discard and retry.  is.character() replaces the fragile
      # class(x) == "character" comparison.
      if (!is.null(out.data) &&
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
        out.data <- NULL
      }
      try_num <- try_num + 1
    }

    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later","\n")
      return(data)
    }

    # Split the response into one string per <PubmedArticle> record.
    xml_data <- paste(out.data, collapse = "")
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
      # trim extra stuff at the end of the record
      if (!grepl("</PubmedArticle>$", x))
        x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
      # Rebuild XML structure and proceed
      x <- paste("<PubmedArticle>", x)
      gsub("[[:space:]]{2,}", " ", x)},
      USE.NAMES = FALSE, simplify = TRUE)

    # Titles: strip markup, join multi-part titles, NA when absent.
    titles <- sapply(xml_data, function(x){
      x <- custom_grep(x, tag = "ArticleTitle", format = "char")
      x <- strip_tags(x)
      if (length(x) > 1){
        x <- paste(x, collapse = " ", sep = " ")
      } else if (length(x) < 1) {
        x <- NA
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # Abstracts: join multi-part abstracts first (as the original did),
    # then strip markup; NA when absent.
    abstract.text <- sapply(xml_data, function(x){
      custom_grep(x, tag = "AbstractText", format = "char")},
      USE.NAMES = FALSE, simplify = TRUE)

    abstracts <- sapply(abstract.text, function(x){
      if (length(x) > 1){
        x <- strip_tags(paste(x, collapse = " ", sep = " "))
      } else if (length(x) < 1) {
        x <- NA
      } else {
        x <- strip_tags(x)
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # Prefix each abstract with its article title.
    if (length(titles) == length(abstracts)){
      abstracts <- paste(titles, abstracts)
    }

    # Write abstracts into ABSTRACT_1..ABSTRACT_n columns of this row.
    if(length(abstracts) > 0){
      data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
      cat(length(abstracts)," abstracts for ",query, " are added in the table.", "\n")
    }

    return(data)
  }
}
220 | |
# Process each row's query.  BUG FIX: the original error handler ended
# with Sys.sleep(5), whose invisible NULL became the new value of `data`,
# so a single failed query silently destroyed the entire table.  The
# handler now returns the untouched table instead.
for (i in seq_len(nrow(data))) {
  data <- tryCatch(pubmed_data_in_table(data = data,
                                        row = i,
                                        query = data[i, id_col_index],
                                        number = args$number,
                                        key = args$key,
                                        abstract = args$abstract),
                   error = function(e) {
                     print('main error')
                     print(e)
                     Sys.sleep(5)
                     data  # keep the table intact, do not return NULL
                   })
}

# Save the (possibly widened) table as tab-separated text.
write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
235 | |
236 |