Mercurial > repos > iuc > abstracts_by_pmids
comparison pmids_to_pubtator_matrix.R @ 0:ff904894ccaa draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author | iuc |
---|---|
date | Wed, 24 Mar 2021 08:32:54 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff904894ccaa |
---|---|
1 #!/usr/bin/env Rscript | |
2 #tool: pmids_to_pubtator_matrix | |
3 # | |
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the | |
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted. | |
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term | |
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as | |
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID). | |
9 #Also, by default all terms are extracted; otherwise the user can specify a number of most frequent words to be extracted per row. | |
10 # | |
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs. | |
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc. | |
13 # | |
14 #Output: Binary matrix in which each column represents one of the extracted terms. | |
15 # | |
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-b] | |
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]] | |
18 # | |
19 # optional arguments: | |
20 # -h, --help show help message | |
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted. | |
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"] | |
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...] | |
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"] | |
26 | |
# Optional bootstrap: when the script is started with --install_packages,
# install every required package that is not available yet.
# commandArgs() is inspected directly because argparse itself may still be
# missing at this point.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  for (pkg in c("argparse", "stringr", "RCurl", "stringi")) {
    # requireNamespace() only tests availability and does not attach the
    # package; attaching is left to the library() calls below.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # `repos` is spelled out in full (no partial argument matching) and the
      # mirror is contacted over HTTPS.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
34 | |
35 suppressPackageStartupMessages(library("argparse")) | |
36 library("stringr") | |
37 library("RCurl") | |
38 library("stringi") | |
39 | |
# ---- Command-line interface -------------------------------------------------
parser <- ArgumentParser()

parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pmids_to_pubtator_matrix_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-c", "--categories", choices = c("Gene", "Disease", "Mutation", "Chemical", "Species"), nargs = "+",
                    default = c("Gene", "Disease", "Mutation", "Chemical"),
                    help = "Pubtator categories that should be considered. [default \"%(default)s\"]")
parser$add_argument("-b", "--byid", action = "store_true", default = FALSE,
                    help = "If you want to find common gene IDs / mesh IDs instead of scientific terms.")
parser$add_argument("-n", "--number", default = NULL, type = "integer",
                    help = "Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail early with a readable message instead of an obscure read.delim() error.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify one.", call. = FALSE)
}

# ---- Input data and shared state --------------------------------------------
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")

# Indices of all columns whose name contains "PMID".
pmid_cols_index <- grep("PMID", names(data))
# Result matrix (one row per input row, one column per term/ID).
word_matrix <- data.frame()
# Lookup table between terms and their IDs, used in --byid mode.
dict_table <- data.frame()
# Running count of requested PMIDs, used to pause between large request series.
pmids_count <- 0
# Maximum number of PMIDs sent to PubTator in a single request.
pubtator_max_ids <- 100
66 | |
67 | |
# Parse a PubTator text export and append its annotation rows to `table`.
#
# out_data: character string holding the raw export, lines separated by "\n".
# table:    matrix of previously collected annotation rows (6 columns),
#           or NULL when nothing has been collected yet.
#
# Returns `table` with one additional row per annotation line. An annotation
# line is a tab-separated line with 5 or 6 fields; 5-field lines are padded
# with NA so every row has 6 columns. All other lines are ignored.
# The first two lines of the export are always skipped (presumably the title
# and abstract lines of the first record -- confirm against the PubTator
# format). If nothing can be added, `table` is returned unchanged.
merge_pubtator_table <- function(out_data, table) {
  lines <- unlist(strsplit(out_data, "\n", fixed = TRUE))
  # Guard against short input: `3:length(lines)` would count backwards and
  # wrongly treat the skipped header lines as annotations.
  if (length(lines) < 3) {
    return(table)
  }
  fields <- strsplit(lines[-(1:2)], "\t", fixed = TRUE)
  fields <- fields[lengths(fields) %in% c(5L, 6L)]
  if (length(fields) == 0) {
    return(table)
  }
  # Pad every record to 6 fields and bind all rows in one step instead of
  # growing the matrix line by line.
  rows <- t(vapply(fields,
                   function(rec) c(rec, NA_character_)[1:6],
                   character(6)))
  rbind(table, rows)
}
81 | |
82 | |
# Download the PubTator annotations for a set of PMIDs.
#
# pmids: character vector of PubMed IDs.
#
# The PMIDs are requested in batches of at most `pubtator_max_ids` IDs. Each
# batch is retried with an increasing pause after a failed request; once a
# batch has been retried for more than 3 minutes the function gives up and
# returns whatever has been collected up to that point.
#
# Returns the annotation rows accumulated by merge_pubtator_table(), or NULL
# when nothing could be retrieved.
get_pubtator_terms <- function(pmids) {
  base_url <- "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids="
  table <- NULL
  batches <- split(pmids, ceiling(seq_along(pmids) / pubtator_max_ids))
  for (pmid_split in batches) {
    out_data <- NULL
    try_num <- 1
    t_0 <- Sys.time()
    # Loop until the request succeeds (out_data is set) or the time is up.
    while (is.null(out_data)) {
      if (try_num > 1) {
        cat("Connection problem. Please wait. Try number:", try_num, "\n")
        Sys.sleep(time = 2 * try_num)
      }
      try_num <- try_num + 1
      # Timing check: kill at 3 min
      if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > 3) {
        message("Killing the request! Something is not working. Please, try again later", "\n")
        return(table)
      }
      out_data <- tryCatch(
        getURL(paste0(base_url, paste(pmid_split, collapse = ","))),
        error = function(e) {
          print(e)
          # Returning NULL keeps the while loop running. A `next` here would
          # be evaluated inside the handler function, where no loop exists,
          # and would abort the script instead of retrying.
          NULL
        }
      )
    }
    table <- merge_pubtator_table(out_data, table)
  }
  table
}
118 | |
# Reduce a PubTator annotation table to the terms of the requested categories.
#
# table:      matrix or data frame with 6 columns, as built by
#             merge_pubtator_table(); column 4 is used as the term, column 5
#             as the category and column 6 as the ID. May be NULL.
# categories: character vector (or list) of category names to keep.
#
# Returns a two-column data frame (term, ID) holding the rows of the requested
# categories, grouped in the order in which the categories were given. Rows
# whose term or ID is missing (NA or the string "NA") are dropped.
# Returns NULL when `table` is NULL or does not have exactly 6 columns.
extract_category_terms <- function(table, categories) {
  # identical() also covers NULL / dimensionless input, for which
  # `ncol(table) == 6` would be a zero-length condition and raise an error.
  if (is.null(table) || !identical(ncol(table), 6L)) {
    return(NULL)
  }
  categories <- as.character(unlist(categories))
  table <- as.data.frame(table, stringsAsFactors = FALSE)
  row_category <- as.character(table[, 5])
  # Collect the row indices category by category to keep the grouping order.
  index_categories <- unlist(lapply(categories, function(category) {
    which(row_category == category)
  }))
  table <- table[index_categories, c(4, 6)]
  keep <- !is.na(table[, 1]) & table[, 1] != "NA" &
    !is.na(table[, 2]) & table[, 2] != "NA"
  table[keep, ]
}
138 | |
# Select the unique (or the most frequent) terms or IDs of an annotation table.
#
# table:  two-column data frame (term, ID) as returned by
#         extract_category_terms(), or NULL.
# byid:   TRUE to work on the ID column, FALSE to work on the lower-cased
#         terms. Defaults to the command-line option `args$byid`.
# number: number of most frequent IDs/terms to keep, or NULL to keep all.
#         Defaults to the command-line option `args$number`.
#
# Returns
#   * NULL / the empty table unchanged when there is nothing to process,
#   * byid = TRUE:  the first row of `table` for every selected ID,
#   * byid = FALSE, number = NULL: the first row of `table` for every
#     lower-cased term,
#   * byid = FALSE, number given: a data frame with the columns `term` and
#     `Freq` holding the most frequent lower-cased terms.
extract_frequent_ids_or_terms <- function(table, byid = args$byid, number = args$number) {
  if (is.null(table) || NROW(table) == 0) {
    return(table)
  }
  key_col <- if (byid) 2 else 1
  if (!byid) {
    table[, 1] <- tolower(as.character(table[, 1]))
  }
  if (is.null(number)) {
    return(table[!duplicated(table[, key_col]), ])
  }

  # Frequency of every ID/term, most frequent first. base::table is spelled
  # out because the parameter `table` shadows the function name.
  counts <- as.data.frame(base::table(table[, key_col]), stringsAsFactors = FALSE)
  counts <- counts[order(counts$Freq, decreasing = TRUE), ]
  # seq_len() avoids the c(1, 0) trap of `1:0`.
  counts <- counts[seq_len(max(0, min(number, nrow(counts)))), ]

  if (byid) {
    # Keep only the rows that belong to the top IDs, one row per ID.
    table <- table[as.character(table[, 2]) %in% as.character(counts[[1]]), ]
    return(table[!duplicated(table[, 2]), ])
  }
  colnames(counts)[1] <- "term"
  counts$term <- as.character(counts$term)
  counts
}
173 | |
174 | |
# For all PMIDs of a row get the PubTator terms and remember which terms/IDs
# belong to that row. The binary matrix is assembled once after the loop, so
# every input row is guaranteed to get exactly one output row.
row_keys <- vector("list", nrow(data))
dict_parts <- list()
for (i in seq_len(nrow(data))) {
  pmids <- as.character(unlist(data[i, pmid_cols_index, drop = FALSE], use.names = FALSE))
  # Drop missing entries: real NA, the string "NA" and empty cells.
  pmids <- pmids[!is.na(pmids) & nzchar(pmids) & pmids != "NA"]
  if (pmids_count > 10000) {
    cat("Break (10s) to avoid killing of requests. Please wait.", "\n")
    Sys.sleep(10)
    pmids_count <- 0
  }
  pmids_count <- pmids_count + length(pmids)
  if (length(pmids) == 0) {
    cat("No terms for PMIDs of row", i, " were found.", "\n")
    next
  }
  # get PubTator terms and process them with functions
  table <- get_pubtator_terms(pmids)
  if (!is.null(table)) {
    table <- extract_category_terms(table, args$categories)
  }
  table <- extract_frequent_ids_or_terms(table)
  if (is.null(table) || nrow(table) == 0) {
    cat("No terms for PMIDs of row", i, " were found.", "\n")
    next
  }
  if (args$byid) {
    keys <- as.character(table[[2]])
    # remember which term belongs to which ID
    dict_parts[[length(dict_parts) + 1]] <- data.frame(
      term = as.character(table[[1]]),
      mesh_id = keys,
      stringsAsFactors = FALSE
    )
    cat(length(keys), " IDs for PMIDs of row", i, " were added", "\n")
  } else {
    keys <- as.character(table[[1]])
    cat(length(keys), " terms for PMIDs of row", i, " were added.", "\n")
  }
  row_keys[[i]] <- unique(keys[!is.na(keys)])
}

# All distinct terms/IDs in order of first appearance.
all_keys <- unique(unlist(row_keys, use.names = FALSE))
if (is.null(all_keys)) {
  all_keys <- character(0)
}

# Column labels: the terms themselves, or in --byid mode the first term that
# was seen for each ID.
labels <- all_keys
if (args$byid && length(all_keys) > 0) {
  dict_table <- do.call(rbind, dict_parts)
  dict_table <- dict_table[!duplicated(dict_table$mesh_id), ]
  labels <- dict_table$term[match(all_keys, dict_table$mesh_id)]
}
labels <- gsub("[^[:print:]]", "", labels)
labels <- gsub('\"', "", labels, fixed = TRUE)

# Row x key membership (1 = key present in the abstracts of that row).
membership <- matrix(0, nrow = nrow(data), ncol = length(all_keys))
for (i in seq_along(row_keys)) {
  if (length(row_keys[[i]]) > 0) {
    membership[i, match(row_keys[[i]], all_keys)] <- 1
  }
}

# merge duplicated columns: keys that share a label collapse into one column
# that is 1 if any of them is present. Columns are sorted by label.
groups <- split(seq_along(labels), labels)
word_matrix <- matrix(0, nrow = nrow(data), ncol = length(groups))
colnames(word_matrix) <- names(groups)
for (j in seq_along(groups)) {
  word_matrix[, j] <- as.numeric(rowSums(membership[, groups[[j]], drop = FALSE]) > 0)
}

# save binary matrix
cat("Matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), " columns generated.", "\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = "\t", quote = FALSE)