Mercurial > repos > dlalgroup > pmids_to_pubtator_matrix
comparison abstracts_by_pmids.R @ 0:3f4adc85ba5d draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:01:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3f4adc85ba5d |
---|---|
1 #!/usr/bin/env Rscript | |
2 #TOOL2 abstracts_by_pmids | |
3 # | |
4 #For each row of a table, this tool retrieves the abstracts of all PMIDs in that row and saves them in additional columns. | |
5 # | |
6 #Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc. | |
7 # | |
8 #Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed. | |
9 #The abstract columns are called "ABSTRACT_1", "ABSTRACT_2" etc. | |
10 # | |
11 # Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] [--install_packages] | |
12 # | |
13 # optional arguments: | |
14 # -h, --help show help message | |
15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
16 # -o OUTPUT, --output OUTPUT output file name. [default "abstracts_by_pmids_output"] | |
17 | |
18 | |
# Optional bootstrap step: when the script is started with --install_packages,
# install every required package that is not yet available.
# The flag is looked up in the raw command line (commandArgs()) because the
# argument parser itself comes from one of the packages that may be missing.
if ("--install_packages" %in% commandArgs()) {
  print('Installing packages')
  required_packages <- c("argparse", "reutils", "easyPubMed", "textclean")
  for (pkg in required_packages) {
    # requireNamespace() only tests availability; the packages are attached
    # by the library() calls that follow this block.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # `repos` is spelled out (the previous `repo=` relied on partial
      # argument matching) and the mirror is contacted over HTTPS.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
26 | |
27 suppressPackageStartupMessages(library("argparse")) | |
28 library("reutils") | |
29 suppressPackageStartupMessages(library("easyPubMed")) | |
30 suppressPackageStartupMessages(library("textclean")) | |
31 | |
# Command-line interface.
#   -i / --input          tab-delimited input table
#   -o / --output         output file name
#   --install_packages    handled before the packages are loaded; declared
#                         here so the parser accepts the flag
parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail early with a readable message instead of letting read.delim() choke on
# a NULL file name.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify the table with PMID columns.",
       call. = FALSE)
}

# Read the table and remember which columns hold PMIDs: every column whose
# name contains "PMID".
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
pmids_cols_index <- grep("PMID", names(data))
44 | |
# Remove the inline formatting tags <i>, <b>, <sub> and <exp> (opening and
# closing, case-insensitive) from abstract text.
strip_inline_tags <- function(text) {
  for (tag in c("i", "b", "sub", "exp")) {
    text <- gsub(paste0("</{0,1}", tag, ">"), "", text, ignore.case = TRUE)
  }
  text
}

# Call efetch() for the given PMIDs, retrying until a usable result arrives.
# A call that raises an error, or whose error field equals
# "HTTP error: Status 400; Bad Request", is retried after a pause that grows
# with the attempt number. Returns the efetch result, or NULL once more than
# `max_minutes` minutes have passed since the first attempt.
request_pubmed_records <- function(PMIDs, max_minutes = 3) {
  efetch_result <- NULL
  try_num <- 1
  t_0 <- Sys.time()

  while (is.null(efetch_result)) {
    if (try_num > 1) {
      Sys.sleep(time = 1 * try_num)
      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
    }

    # Timing check: give up once the time budget is exhausted.
    if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > max_minutes) {
      return(NULL)
    }

    efetch_result <- tryCatch({
      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml"))
    }, error = function(e) {
      NULL
    })

    request_error <- as.list(efetch_result$errors)$error
    # isTRUE() keeps an NA or zero-length error field from crashing the `if`.
    if (!is.null(request_error) &&
        isTRUE(request_error == "HTTP error: Status 400; Bad Request")) {
      efetch_result <- NULL
    }

    try_num <- try_num + 1
  }

  efetch_result
}

# Fetch the abstracts for the PMIDs of one table row and store them in the
# columns ABSTRACT_1, ABSTRACT_2, ... of that row.
#
# Arguments:
#   PMIDs  character vector of PubMed IDs to look up.
#   row    index of the row of the table `data` that receives the abstracts.
#
# Returns a copy of the table `data` (taken from the enclosing environment)
# with the abstract columns of `row` filled in. The table is returned
# unchanged when the request is abandoned or no article is found.
# Abstracts are numbered in the order the articles appear in the response.
fetch_abstracts <- function(PMIDs, row) {
  efetch_result <- request_pubmed_records(PMIDs)
  if (is.null(efetch_result)) {
    message("Killing the request! Something is not working. Please, try again later","\n")
    return(data)
  }

  # Split the response into one string per <PubmedArticle> record.
  xml_data <- strsplit(efetch_result$content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
  xml_data <- vapply(xml_data, function(x) {
    # Trim anything that follows the closing tag of the record.
    if (!grepl("</PubmedArticle>$", x)) {
      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
    }
    # Rebuild the XML structure and squeeze runs of whitespace.
    x <- paste("<PubmedArticle>", x)
    gsub("[[:space:]]{2,}", " ", x)
  }, character(1), USE.NAMES = FALSE)

  # One list element per article. lapply() is used on purpose: sapply() would
  # collapse the result into a matrix whenever all articles have the same
  # number of AbstractText sections, and the sections would then be treated
  # as separate abstracts.
  abstract_text <- lapply(xml_data, function(x) {
    custom_grep(x, tag = "AbstractText", format = "char")
  })

  # Join the sections of each article into one string; articles without an
  # abstract yield NA.
  abstracts <- vapply(abstract_text, function(x) {
    if (length(x) < 1) {
      return(NA_character_)
    }
    if (length(x) > 1) {
      x <- paste(x, collapse = " ")
    }
    strip_inline_tags(x)
  }, character(1), USE.NAMES = FALSE)

  if (length(abstracts) > 0) {
    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
  }

  return(data)
}
125 | |
126 | |
# Walk through the table row by row, look up the abstracts for the PMIDs of
# each row and finally write the extended table as a tab-delimited file.
for (row in seq_len(nrow(data))) {
  # Flatten the PMID cells of this row into a plain vector so that unique()
  # de-duplicates the values (on a one-row data.frame it would not).
  PMIDs <- unique(as.character(unlist(data[row, pmids_cols_index], use.names = FALSE)))
  # Drop missing and empty cells.
  PMIDs <- PMIDs[!is.na(PMIDs) & nzchar(PMIDs) & PMIDs != "NA"]

  if (length(PMIDs) > 0) {
    data <- tryCatch(
      fetch_abstracts(PMIDs, row),
      error = function(e) {
        message("Failed to add abstracts for row ", row, ": ", conditionMessage(e))
        Sys.sleep(3)
        # Hand back the table unchanged; returning the value of Sys.sleep()
        # (NULL) here would wipe out everything collected so far.
        data
      }
    )
  } else {
    print(paste("No PMIDs in row", row))
  }
}

write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE)
142 |