comparison abstracts_by_pmids.R @ 0:f40606281050 draft

"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author dlalgroup
date Thu, 24 Sep 2020 03:01:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f40606281050
1 #!/usr/bin/env Rscript
2 #TOOL2 abstracts_by_pmids
3 #
4 #This tool retrieves for all PMIDs in each row of a table the according abstracts and saves them in additional columns.
5 #
6 #Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc.
7 #
8 #Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed.
9 #The abstract columns are called "ABSTRACT_1", "ABSTRACT_2" etc.
10 #
11 # Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] [--install_packages]
12 #
13 # optional arguments:
14 # -h, --help show help message
15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
16 # -o OUTPUT, --output OUTPUT output file name. [default "abstracts_by_pmids_output"]
17
18
# Optional bootstrap step: when the script is started with --install_packages,
# install every required package that is not yet available.
# requireNamespace() is used as the availability probe so that nothing is
# attached here; the library() calls below do the actual loading.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  required_packages <- c("argparse", "reutils", "easyPubMed", "textclean")
  for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
26
27 suppressPackageStartupMessages(library("argparse"))
28 library("reutils")
29 suppressPackageStartupMessages(library("easyPubMed"))
30 suppressPackageStartupMessages(library("textclean"))
31
# Command-line interface ----
parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail early with a readable message instead of letting read.delim() choke on
# a NULL file name.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to pass a tab-delimited table.",
       call. = FALSE)
}

# Input table; it is extended with ABSTRACT_<n> columns further down.
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")

# Indices of all columns whose name contains "PMID".
pmids_cols_index <- grep("PMID", names(data))
if (length(pmids_cols_index) == 0) {
  warning("No column name containing \"PMID\" found in ", args$input,
          "; no abstracts will be added.", call. = FALSE)
}
44
# Fetch the PubMed abstracts for the PMIDs of one table row.
#
# Args:
#   PMIDs: character vector of PubMed IDs to look up.
#   row:   index of the row of the global table `data` to fill in.
#
# Returns:
#   A copy of the global table `data` in which the columns ABSTRACT_1,
#   ABSTRACT_2, ... of `row` hold the retrieved abstracts, one per article in
#   the order the articles appear in the response. Articles without an
#   abstract yield NA. If no usable response arrives within 3 minutes, `data`
#   is returned unchanged.
fetch_abstracts <- function(PMIDs, row) {

  efetch_result <- NULL
  try_num <- 1
  t_0 <- Sys.time()

  while (is.null(efetch_result)) {

    # Back off a little longer before every retry.
    if (try_num > 1) {
      Sys.sleep(time = 1 * try_num)
      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
    }

    # Timing check: kill at 3 min
    if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > 3) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    efetch_result <- tryCatch(
      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml")),
      error = function(e) NULL
    )

    # A "400 Bad Request" answer is treated like a failed call and retried.
    # isTRUE() keeps the check safe when the error field is NULL or empty.
    fetch_error <- as.list(efetch_result$errors)$error
    if (isTRUE(fetch_error == "HTTP error: Status 400; Bad Request")) {
      efetch_result <- NULL
    }

    try_num <- try_num + 1
  }

  # Split the response into one string per article; the first piece is the
  # document preamble and is dropped.
  articles <- strsplit(efetch_result$content,
                       "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
  articles <- vapply(articles, function(x) {
    # Trim extra stuff at the end of the record.
    if (!grepl("</PubmedArticle>$", x)) {
      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
    }
    # Rebuild the XML structure and collapse runs of whitespace.
    x <- paste("<PubmedArticle>", x)
    gsub("[[:space:]]{2,}", " ", x)
  }, character(1), USE.NAMES = FALSE)

  # One list element per article. lapply() is required here: sapply() would
  # simplify to a matrix whenever all articles have the same number (> 1) of
  # AbstractText sections, and every section would then be mistaken for a
  # separate abstract.
  abstract_parts <- lapply(articles, function(x) {
    custom_grep(x, tag = "AbstractText", format = "char")
  })

  # Inline formatting tags removed from the abstract text. "exp" is kept from
  # the previous implementation; "sup" is the superscript tag that PubMed
  # actually emits.
  inline_tags <- c("i", "b", "sub", "sup", "exp")

  abstracts <- vapply(abstract_parts, function(parts) {
    if (length(parts) < 1) {
      return(NA_character_)
    }
    # Join structured abstracts (several AbstractText sections) into one text.
    text <- paste(parts, collapse = " ")
    for (tag in inline_tags) {
      text <- gsub(paste0("</{0,1}", tag, ">"), "", text, ignore.case = TRUE)
    }
    text
  }, character(1), USE.NAMES = FALSE)

  if (length(abstracts) > 0) {
    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
  }

  data
}
125
126
# Walk through the table row by row and add the abstracts for each row's PMIDs.
for (row in seq_len(nrow(data))) {
  # Flatten the PMID cells of this row into a plain vector first; unique() on
  # a one-row data.frame would compare rows and never drop duplicate IDs.
  row_values <- unlist(data[row, pmids_cols_index, drop = FALSE], use.names = FALSE)
  PMIDs <- unique(as.character(row_values))
  PMIDs <- PMIDs[!is.na(PMIDs) & PMIDs != "NA" & nzchar(PMIDs)]

  if (length(PMIDs) > 0) {
    data <- tryCatch(
      fetch_abstracts(PMIDs, row),
      error = function(e) {
        # Keep the table collected so far. The handler's value is assigned to
        # `data`, so it must return the table rather than the NULL that
        # Sys.sleep() yields.
        message("Could not fetch abstracts for row ", row, ": ", conditionMessage(e))
        Sys.sleep(3)
        data
      }
    )
  } else {
    print(paste("No PMIDs in row", row))
  }
}

write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE)
142