comparison pmids_to_pubtator_matrix.R @ 0:ff904894ccaa draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author iuc
date Wed, 24 Mar 2021 08:32:54 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff904894ccaa
1 #!/usr/bin/env Rscript
2 #tool: pmids_to_pubtator_matrix
3 #
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted.
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID).
9 #äAlso, by default all terms are extracted, otherwise the user can specify a number of most frequent words to be extracted per row.
10 #
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs.
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc.
13 #
14 #Output: Binary matrix in that each column represents one of the extracted terms.
15 #
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER]
17 # [-c {Genes,Diseases,Mutations,Chemicals,Species} [{Genes,Diseases,Mutations,Chemicals,Species} ...]]
18 #
19 # optional arguments:
20 # -h, --help show help message
21 # -i INPUT, --input INPUT input file name. add path if file is not in workind directory
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"]
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Genes,Diseases,Mutations,Chemicals,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"]
26
27 if ("--install_packages" %in% commandArgs()) {
28 print("Installing packages")
29 if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/");
30 if (!require("stringr")) install.packages("stringr", repo = "http://cran.rstudio.com/");
31 if (!require("RCurl")) install.packages("RCurl", repo = "http://cran.rstudio.com/");
32 if (!require("stringi")) install.packages("stringi", repo = "http://cran.rstudio.com/");
33 }
34
35 suppressPackageStartupMessages(library("argparse"))
36 library("stringr")
37 library("RCurl")
38 library("stringi")
39
40 parser <- ArgumentParser()
41
42 parser$add_argument("-i", "--input",
43 help = "input fie name. add path if file is not in workind directory")
44 parser$add_argument("-o", "--output", default = "pmids_to_pubtator_matrix_output",
45 help = "output file name. [default \"%(default)s\"]")
46 parser$add_argument("-c", "--categories", choices = c("Gene", "Disease", "Mutation", "Chemical", "Species"), nargs = "+",
47 default = c("Gene", "Disease", "Mutation", "Chemical"),
48 help = "Pubtator categories that should be considered. [default \"%(default)s\"]")
49 parser$add_argument("-b", "--byid", action = "store_true", default = FALSE,
50 help = "If you want to find common gene IDs / mesh IDs instead of scientific terms.")
51 parser$add_argument("-n", "--number", default = NULL, type = "integer",
52 help = "Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.")
53 parser$add_argument("--install_packages", action = "store_true", default = FALSE,
54 help = "If you want to auto install missing required packages.")
55
56 args <- parser$parse_args()
57
58
59 data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
60
61 pmid_cols_index <- grep(c("PMID"), names(data))
62 word_matrix <- data.frame()
63 dict_table <- data.frame()
64 pmids_count <- 0
65 pubtator_max_ids <- 100
66
67
68 merge_pubtator_table <- function(out_data, table) {
69 out_data <- unlist(strsplit(out_data, "\n", fixed = T))
70 for (i in 3:length(out_data)) {
71 temps <- unlist(strsplit(out_data[i], "\t", fixed = T))
72 if (length(temps) == 5) {
73 temps <- c(temps, NA)
74 }
75 if (length(temps) == 6) {
76 table <- rbind(table, temps)
77 }
78 }
79 return(table)
80 }
81
82
83 get_pubtator_terms <- function(pmids) {
84 table <- NULL
85 for (pmid_split in split(pmids, ceiling(seq_along(pmids) / pubtator_max_ids))) {
86 out_data <- NULL
87 try_num <- 1
88 t_0 <- Sys.time()
89 while (TRUE) {
90 # Timing check: kill at 3 min
91 if (try_num > 1) {
92 cat("Connection problem. Please wait. Try number:", try_num, "\n")
93 Sys.sleep(time = 2 * try_num)
94 }
95 try_num <- try_num + 1
96 t_1 <- Sys.time()
97 if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
98 message("Killing the request! Something is not working. Please, try again later", "\n")
99 return(table)
100 }
101 out_data <- tryCatch({
102 getURL(paste("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=",
103 paste(pmid_split, collapse = ","), sep = ""))
104 }, error = function(e) {
105 print(e)
106 next
107 }, finally = {
108 Sys.sleep(0)
109 })
110 if (!is.null(out_data)) {
111 table <- merge_pubtator_table(out_data, table)
112 break
113 }
114 }
115 }
116 return(table)
117 }
118
119 extract_category_terms <- function(table, categories) {
120 index_categories <- c()
121 categories <- as.character(unlist(categories))
122 if (ncol(table) == 6) {
123 for (i in categories) {
124 tmp_index <- grep(TRUE, i == as.character(table[, 5]))
125 if (length(tmp_index) > 0) {
126 index_categories <- c(index_categories, tmp_index)
127 }
128 }
129 table <- as.data.frame(table, stringsAsFactors = FALSE)
130 table <- table[index_categories, c(4, 6)]
131 table <- table[!is.na(table[, 2]), ]
132 table <- table[!(table[, 2] == "NA"), ]
133 table <- table[!(table[, 1] == "NA"), ]
134 }else{
135 return(NULL)
136 }
137 }
138
139 extract_frequent_ids_or_terms <- function(table) {
140 if (is.null(table)) {
141 return(NULL)
142 break
143 }
144 if (args$byid) {
145 if (!is.null(args$number)) {
146 #retrieve top X mesh_ids
147 table_mesh <- as.data.frame(table(table[, 2]))
148 colnames(table_mesh)[1] <- "mesh_id"
149 table <- table[order(table_mesh$Freq, decreasing = TRUE), ]
150 table <- table[1:min(args$number, nrow(table_mesh)), ]
151 table_mesh$mesh_id <- as.character(table_mesh$mesh_id)
152 #subset table for top X mesh_ids
153 table <- table[which(as.character(table$V6) %in% as.character(table_mesh$mesh_id)), ]
154 table <- table[!duplicated(table[, 2]), ]
155 } else {
156 table <- table[!duplicated(table[, 2]), ]
157 }
158 } else {
159 if (!is.null(args$number)) {
160 table[, 1] <- tolower(as.character(table[, 1]))
161 table <- as.data.frame(table(table[, 1]))
162 colnames(table)[1] <- "term"
163 table <- table[order(table$Freq, decreasing = TRUE), ]
164 table <- table[1:min(args$number, nrow(table)), ]
165 table$term <- as.character(table$term)
166 } else {
167 table[, 1] <- tolower(as.character(table[, 1]))
168 table <- table[!duplicated(table[, 1]), ]
169 }
170 }
171 return(table)
172 }
173
174
175 #for all PMIDs of a row get PubTator terms and add them to the matrix
176 for (i in seq(nrow(data))) {
177 pmids <- as.character(data[i, pmid_cols_index])
178 pmids <- pmids[!pmids == "NA"]
179 if (pmids_count > 10000) {
180 cat("Break (10s) to avoid killing of requests. Please wait.", "\n")
181 Sys.sleep(10)
182 pmids_count <- 0
183 }
184 pmids_count <- pmids_count + length(pmids)
185 #get puptator terms and process them with functions
186 if (length(pmids) > 0) {
187 table <- get_pubtator_terms(pmids)
188 table <- extract_category_terms(table, args$categories)
189 table <- extract_frequent_ids_or_terms(table)
190 if (!is.null(table)) {
191 colnames(table) <- c("term", "mesh_id")
192 # add data in binary matrix
193 if (args$byid) {
194 mesh_ids <- as.character(table$mesh_id)
195 if (length(mesh_ids) > 0) {
196 word_matrix[i, mesh_ids] <- 1
197 cat(length(mesh_ids), " IDs for PMIDs of row", i, " were added", "\n")
198 # add data in dictionary
199 dict_table <- rbind(dict_table, table)
200 dict_table <- dict_table[!duplicated(as.character(dict_table[, 2])), ]
201 }
202 } else {
203 terms <- as.character(table[, 1])
204 if (length(terms) > 0) {
205 word_matrix[i, terms] <- 1
206 cat(length(terms), " terms for PMIDs of row", i, " were added.", "\n")
207 }
208 }
209 }
210 } else {
211 cat("No terms for PMIDs of row", i, " were found.", "\n")
212 }
213 }
214
215 if (args$byid) {
216 #change column names of matrix: exchange mesh ids/ids with term
217 index_names <- match(names(word_matrix), as.character(dict_table[[2]]))
218 names(word_matrix) <- dict_table[index_names, 1]
219 }
220
221 colnames(word_matrix) <- gsub("[^[:print:]]", "", colnames(word_matrix))
222 colnames(word_matrix) <- gsub('\"', "", colnames(word_matrix), fixed = TRUE)
223
224 #merge duplicated columns
225 word_matrix <- as.data.frame(do.call(cbind, by(t(word_matrix), INDICES = names(word_matrix), FUN = colSums)))
226
227 #save binary matrix
228 word_matrix <- as.matrix(word_matrix)
229 word_matrix[is.na(word_matrix)] <- 0
230 cat("Matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), " columns generated.", "\n")
231 write.table(word_matrix, args$output, row.names = FALSE, sep = "\t", quote = FALSE)