comparison pmids_to_pubtator_matrix.R @ 0:34ed44f3f85c draft

"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author dlalgroup
date Thu, 24 Sep 2020 02:17:05 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:34ed44f3f85c
1 #!/usr/bin/env Rscript
2 #tool: pmids_to_pubtator_matrix
3 #
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted.
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID).
9 #Also, by default all terms are extracted; otherwise the user can specify a number of most frequent words to be extracted per row.
10 #
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs.
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc.
13 #
14 #Output: Binary matrix in that each column represents one of the extracted terms.
15 #
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-b]
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]]
18 #
19 # optional arguments:
20 # -h, --help show help message
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"]
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"]
26
# Optional bootstrap: when the script is called with --install_packages,
# install every required package that is not available yet. The flag is read
# straight from commandArgs() because argparse itself may still be missing at
# this point.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  for (pkg in c("argparse", "stringr", "RCurl", "stringi")) {
    # requireNamespace() only probes for the package; the library() calls
    # that follow this block do the actual attaching.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # `repos` is spelled out (the original relied on partial matching of
      # `repo`) and the mirror is contacted over HTTPS.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
34
35 suppressPackageStartupMessages(library("argparse"))
36 library('stringr')
37 library('stringi')
38 library('RCurl')
39
# Command-line interface ----
parser <- ArgumentParser()

parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pmids_to_pubtator_matrix_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-c", "--categories",
                    choices = c("Gene", "Disease", "Mutation", "Chemical", "Species"),
                    nargs = "+",
                    default = c("Gene", "Disease", "Mutation", "Chemical"),
                    help = "Pubtator categories that should be considered. [default \"%(default)s\"]")
parser$add_argument("-b", "--byid", action = "store_true", default = FALSE,
                    help = "If you want to find common gene IDs / mesh IDs instead of scientific terms.")
parser$add_argument("-n", "--number", default = NULL, type = "integer",
                    help = "Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail early with a readable message instead of an obscure read.delim() error.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify the table with PMID columns.",
       call. = FALSE)
}

# Input table and global state ----
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")

# Columns whose name contains "PMID" hold the PubMed IDs of each row.
pmid_cols_index <- grep("PMID", names(data))
if (length(pmid_cols_index) == 0) {
  stop("The input table has no column whose name contains \"PMID\".", call. = FALSE)
}

word_matrix <- data.frame()   # term matrix, filled by the main loop
dict.table <- data.frame()    # term <-> ID dictionary (used with --byid)
pmids_count <- 0              # PMIDs requested since the last pause
pubtator_max_ids <- 100       # PMIDs sent to PubTator per request
66
# Download the PubTator export for one batch of PMIDs.
#
# Connection errors are retried with a growing pause (2 * try number seconds).
# Returns the response text, or NULL once more than `max_minutes` minutes have
# passed without a successful request.
fetch_pubtator_export <- function(pmid_batch, max_minutes = 3) {
  url <- paste0(
    "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=",
    paste(pmid_batch, collapse = ",")
  )
  started <- Sys.time()
  try_num <- 1

  repeat {
    if (try_num > 1) {
      cat("Connection problem. Please wait. Try number:", try_num, "\n")
      Sys.sleep(time = 2 * try_num)
    }
    try_num <- try_num + 1

    # Timing check: give up after `max_minutes`.
    if (as.numeric(difftime(Sys.time(), started, units = "mins")) > max_minutes) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(NULL)
    }

    # The handler returns NULL so that the loop retries. (`next` cannot be
    # used inside a handler function: it is not evaluated in loop context and
    # aborts the script instead of retrying.)
    response <- tryCatch(
      getURL(url),
      error = function(e) {
        print(e)
        NULL
      }
    )
    if (!is.null(response)) {
      return(response)
    }
  }
}

# Turn PubTator export text into a 6-column character matrix with one row per
# annotation line. Lines are split on tabs; only lines with 5 or 6 fields are
# kept (5-field lines are padded with NA in column 6), which also discards the
# "|t|" / "|a|" title and abstract lines. Returns NULL when no line qualifies.
parse_pubtator_export <- function(export_text) {
  lines <- unlist(strsplit(export_text, "\n", fixed = TRUE))
  fields <- strsplit(lines, "\t", fixed = TRUE)
  n_fields <- lengths(fields)
  fields <- fields[n_fields == 5 | n_fields == 6]
  if (length(fields) == 0) {
    return(NULL)
  }
  # Indexing up to 6 pads a 5-field line with NA.
  do.call(rbind, lapply(fields, function(f) f[seq_len(6)]))
}

# Collect the PubTator annotations for a set of PMIDs.
#
# Arguments:
#   pmids       character vector of PubMed IDs; requested in batches of
#               `pubtator_max_ids` (global).
#   categories  PubTator categories to keep; compared with column 5 of the
#               annotation lines.
#
# Reads the globals `args$byid` and `args$number`:
#   --byid, no --number : one row per distinct ID (column 6).
#   --byid, --number N  : one row per ID, restricted to the N most frequent IDs.
#   terms,  no --number : one row per distinct lower-cased term (column 4).
#   terms,  --number N  : the N most frequent lower-cased terms with their
#                         frequency in the second column.
#
# Returns a two-column data frame (term first), or NULL when no annotation of
# the requested categories was found. If a request times out, the batches
# fetched so far are still processed.
get_pubtator_terms <- function(pmids, categories) {
  batches <- split(pmids, ceiling(seq_along(pmids) / pubtator_max_ids))
  parsed <- vector("list", length(batches))

  for (b in seq_along(batches)) {
    export_text <- fetch_pubtator_export(batches[[b]])
    if (is.null(export_text)) {
      break
    }
    # `[<-` with list() keeps the slot even when the parser returns NULL.
    parsed[b] <- list(parse_pubtator_export(export_text))
  }

  annotations <- do.call(rbind, parsed)
  if (is.null(annotations) || ncol(annotations) != 6) {
    return(NULL)
  }

  # Row indices are gathered category by category, i.e. in the order of
  # `categories`, as before.
  categories <- as.character(unlist(categories))
  row_index <- unlist(lapply(categories, function(category) {
    which(annotations[, 5] == category)
  }))

  annotations <- as.data.frame(annotations, stringsAsFactors = FALSE)
  annotations <- annotations[row_index, c(4, 6)]

  # Drop rows without a usable term or ID.
  has_term <- !is.na(annotations[, 1]) & annotations[, 1] != "NA"
  has_id <- !is.na(annotations[, 2]) & annotations[, 2] != "NA"
  annotations <- annotations[has_term & has_id, ]
  if (nrow(annotations) == 0) {
    return(NULL)
  }

  n_top <- if (is.null(args$number)) NULL else max(args$number, 0L)

  if (args$byid) {
    if (!is.null(n_top)) {
      # Rank the IDs by frequency and keep only rows of the top N IDs. The
      # frequency table itself has to be sorted and truncated; the original
      # code reordered the annotation rows instead.
      id_counts <- sort(table(annotations[, 2]), decreasing = TRUE)
      top_ids <- head(names(id_counts), n_top)
      annotations <- annotations[annotations[, 2] %in% top_ids, ]
    }
    annotations <- annotations[!duplicated(annotations[, 2]), ]
  } else {
    annotations[, 1] <- tolower(as.character(annotations[, 1]))
    if (!is.null(n_top)) {
      term_counts <- as.data.frame(table(annotations[, 1]), stringsAsFactors = FALSE)
      colnames(term_counts)[1] <- "term"
      term_counts <- term_counts[order(term_counts$Freq, decreasing = TRUE), ]
      annotations <- term_counts[seq_len(min(n_top, nrow(term_counts))), ]
      annotations$term <- as.character(annotations$term)
    } else {
      annotations <- annotations[!duplicated(annotations[, 1]), ]
    }
  }

  annotations
}
175
176
# For every input row, look up the PubTator terms of its PMIDs and remember
# which matrix columns (terms, or IDs with --byid) the row contributes to.
n_rows <- nrow(data)
row_keys <- vector("list", n_rows)

for (i in seq_len(n_rows)) {

  pmids <- as.character(data[i, pmid_cols_index])
  pmids <- pmids[!is.na(pmids) & pmids != "NA" & nzchar(pmids)]

  # Pause now and then so that the service does not start rejecting requests.
  if (pmids_count > 10000) {
    cat("Break (10s) to avoid killing of requests. Please wait.", "\n")
    Sys.sleep(10)
    pmids_count <- 0
  }
  pmids_count <- pmids_count + length(pmids)

  row_terms <- NULL
  if (length(pmids) > 0) {
    row_terms <- get_pubtator_terms(pmids, args$categories)
  }

  # Anything but a non-empty two-column table counts as "nothing found".
  if (is.null(row_terms) || NCOL(row_terms) != 2 || nrow(row_terms) == 0) {
    cat("No terms for PMIDs of row", i, " were found.", "\n")
    next
  }

  colnames(row_terms) <- c("term", "mesh.id")

  if (args$byid) {
    keys <- as.character(row_terms$mesh.id)
    cat(length(keys), " IDs for PMIDs of row", i, " were added", "\n")
    # Dictionary that maps every ID to the first term seen for it.
    dict.table <- rbind(dict.table, row_terms)
    dict.table <- dict.table[!duplicated(as.character(dict.table[, 2])), ]
  } else {
    keys <- as.character(row_terms[, 1])
    cat(length(keys), " terms for PMIDs of row", i, " were added.", "\n")
  }
  row_keys[[i]] <- keys
}

all_keys <- unique(unlist(row_keys, use.names = FALSE))
if (length(all_keys) == 0) {
  stop("No PubTator terms were found for any input row; no matrix was written.",
       call. = FALSE)
}

# Column labels: with --byid the IDs are replaced by their dictionary term.
col_labels <- all_keys
if (args$byid) {
  col_labels <- as.character(
    dict.table[match(all_keys, as.character(dict.table[[2]])), 1]
  )
}
col_labels <- gsub("[^[:print:]]", "", col_labels)
col_labels <- gsub('\"', "", col_labels, fixed = TRUE)

# Keys that end up with the same label share one column ("merge duplicated
# columns"). Columns are sorted by label, as in the previous by()-based merge.
merged_labels <- sort(unique(col_labels))
key_column <- match(col_labels, merged_labels)   # parallel to all_keys

# One row per INPUT row, so trailing rows without terms are kept as all-zero
# rows and the output stays aligned with the input table.
word_matrix <- matrix(0, nrow = n_rows, ncol = length(merged_labels),
                      dimnames = list(NULL, merged_labels))

# Set a cell to 1 when the row has at least one key mapped to that column, so
# merged columns stay binary.
row_id <- rep(seq_len(n_rows), lengths(row_keys))
col_id <- key_column[match(unlist(row_keys, use.names = FALSE), all_keys)]
present <- !is.na(col_id)
word_matrix[cbind(row_id[present], col_id[present])] <- 1

# Save the binary matrix.
cat("Matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), " columns generated.", "\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = "\t")
242
243
244
245
246
247