Mercurial > repos > dlalgroup > simtext_app
comparison pmids_to_pubtator_matrix.R @ 0:34ed44f3f85c draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:17:05 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:34ed44f3f85c |
---|---|
1 #!/usr/bin/env Rscript | |
2 #tool: pmids_to_pubtator_matrix | |
3 # | |
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the | |
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted. | |
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term | |
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as | |
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID). | |
9 #Also, by default all terms are extracted; otherwise the user can specify a number of most frequent words to be extracted per row. | |
10 # | |
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs. | |
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc. | |
13 # | |
14 #Output: Binary matrix in which each column represents one of the extracted terms. | |
15 # | |
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-b] | |
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]] | |
18 # | |
19 # optional arguments: | |
20 # -h, --help show help message | |
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted. | |
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"] | |
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...] | |
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"] | |
26 | |
# Optional bootstrap: when the script is started with --install_packages,
# install every required CRAN package that is not yet available. The packages
# are attached afterwards by the library() calls that follow this block.
if ("--install_packages" %in% commandArgs()) {
  print('Installing packages')
  for (pkg in c("argparse", "stringr", "RCurl", "stringi")) {
    # requireNamespace() only checks availability; it neither attaches the
    # package nor prints startup messages.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # 'repos' is spelled out in full (the original relied on partial
      # argument matching of 'repo') and the mirror is reached over HTTPS.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
34 | |
35 suppressPackageStartupMessages(library("argparse")) | |
36 library('stringr') | |
37 library('stringi') | |
38 library('RCurl') | |
39 | |
# Command line interface. Every option is optional as far as argparse is
# concerned; --input has no default and is passed to read.delim() below.
parser <- ArgumentParser()

parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pmids_to_pubtator_matrix_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-c", "--categories",
                    choices = c("Gene", "Disease", "Mutation", "Chemical", "Species"),
                    nargs = "+",
                    default = c("Gene", "Disease", "Mutation", "Chemical"),
                    help = "Pubtator categories that should be considered. [default \"%(default)s\"]")
parser$add_argument("-b", "--byid", action = "store_true", default = FALSE,
                    help = "If you want to find common gene IDs / mesh IDs instead of scientific terms.")
parser$add_argument("-n", "--number", default = NULL, type = "integer",
                    help = "Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.")
# Declared here so that parse_args() accepts the flag; the flag itself is
# acted upon before the parser is built.
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()
57 | |
58 | |
# Load the tab-separated input table; its first line holds the column names
# and text columns are kept as character vectors.
data <- read.delim(args$input, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Positions of all columns whose name contains "PMID".
pmid_cols_index <- grep("PMID", names(data))

# Global state used by the rest of the script:
#   pubtator_max_ids - number of PMIDs sent to PubTator per request
#   pmids_count      - running count of PMIDs requested since the last pause
#   word_matrix      - term matrix, one row per input row
#   dict.table       - term / ID pairs collected in --byid mode
pubtator_max_ids <- 100
pmids_count <- 0
word_matrix <- data.frame()
dict.table <- data.frame()
66 | |
# Download the PubTator export for one batch of PMIDs, retrying on failure.
#
# pmid_batch: character vector of PMIDs sent in a single request.
#
# Returns the raw response text, or NULL once more than 3 minutes have passed
# without a successful request.
.fetch_pubtator_batch <- function(pmid_batch) {
  request_url <- paste0(
    "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=",
    paste(pmid_batch, collapse = ",")
  )
  started <- Sys.time()
  try_num <- 1

  repeat {
    # Back off a little longer before every retry.
    if (try_num > 1) {
      cat("Connection problem. Please wait. Try number:", try_num, "\n")
      Sys.sleep(time = 2 * try_num)
    }
    try_num <- try_num + 1

    # Timing check: give up at 3 min.
    if (as.numeric(difftime(Sys.time(), started, units = "mins")) > 3) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(NULL)
    }

    # The handler returns NULL instead of calling `next`: a handler is a
    # separate function, so `next` inside it is not reliably bound to the
    # surrounding loop.
    response <- tryCatch(
      getURL(request_url),
      error = function(e) {
        print(e)
        NULL
      }
    )
    if (!is.null(response)) {
      return(response)
    }
  }
}

# Turn one PubTator export response into a 6-column character matrix of
# annotation rows (column 4 = mention text, 5 = category, 6 = identifier).
#
# Lines with 5 tab-separated fields (no identifier) are padded with NA; lines
# with any other field count, and the "<pmid>|t|" / "<pmid>|a|" title and
# abstract lines, are ignored.
#
# Returns NULL when the response contains no annotation row.
.parse_pubtator_export <- function(export_text) {
  lines <- unlist(strsplit(export_text, "\n", fixed = TRUE))
  lines <- lines[!grepl("^[0-9]+\\|[at]\\|", lines)]

  fields <- strsplit(lines, "\t", fixed = TRUE)
  short <- lengths(fields) == 5
  fields[short] <- lapply(fields[short], function(x) c(x, NA))
  fields <- fields[lengths(fields) == 6]

  if (length(fields) == 0) {
    return(NULL)
  }
  do.call(rbind, fields)
}

# Retrieve the PubTator annotations for a set of PMIDs and reduce them to the
# terms (or IDs) of the requested categories.
#
# pmids:      character vector of PMIDs.
# categories: PubTator categories to keep, e.g. c("Gene", "Disease").
# byid:       group by gene/MeSH ID instead of by term text
#             (defaults to the --byid command line option).
# number:     keep only the `number` most frequent terms/IDs; NULL keeps all
#             (defaults to the --number command line option).
# max_ids:    maximum number of PMIDs per request
#             (defaults to the global pubtator_max_ids).
#
# Returns NULL when no annotation could be retrieved; otherwise a two-column
# data frame:
#   * term text and ID, one row per distinct ID (byid) or per distinct
#     lower-cased term (no byid, no number);
#   * lower-cased term and its frequency ("term", "Freq") when `number` is
#     given without byid.
# If a request times out, the annotations collected so far are processed and
# the remaining batches are skipped.
get_pubtator_terms <- function(pmids, categories,
                               byid = args$byid,
                               number = args$number,
                               max_ids = pubtator_max_ids) {

  batches <- split(pmids, ceiling(seq_along(pmids) / max_ids))
  parsed <- vector("list", length(batches))

  for (b in seq_along(batches)) {
    export_text <- .fetch_pubtator_batch(batches[[b]])
    if (is.null(export_text)) {
      break
    }
    batch_rows <- .parse_pubtator_export(export_text)
    if (!is.null(batch_rows)) {
      parsed[[b]] <- batch_rows
    }
  }

  # rbind() skips NULL entries; the result is NULL when nothing was retrieved.
  annotations <- do.call(rbind, parsed)
  if (is.null(annotations) || ncol(annotations) != 6) {
    return(NULL)
  }

  # Keep the rows of the requested categories, grouped in the order in which
  # the categories were given.
  categories <- as.character(unlist(categories))
  keep <- unlist(lapply(categories, function(category) {
    which(annotations[, 5] == category)
  }))

  annotations <- as.data.frame(annotations, stringsAsFactors = FALSE)
  annotations <- annotations[keep, c(4, 6)]

  # Drop annotations without a usable term or identifier.
  usable <- !is.na(annotations[, 1]) & annotations[, 1] != "NA" &
    !is.na(annotations[, 2]) & annotations[, 2] != "NA"
  annotations <- annotations[usable, , drop = FALSE]

  if (nrow(annotations) == 0) {
    return(annotations)
  }

  top_n <- function(n_available) {
    seq_len(max(0, min(number, n_available)))
  }

  if (byid) {
    if (!is.null(number)) {
      # Restrict to the `number` most frequent IDs.
      id_counts <- as.data.frame(table(annotations[, 2]), stringsAsFactors = FALSE)
      id_counts <- id_counts[order(id_counts$Freq, decreasing = TRUE), ]
      top_ids <- as.character(id_counts[top_n(nrow(id_counts)), 1])
      annotations <- annotations[annotations[, 2] %in% top_ids, , drop = FALSE]
    }
    # One row per ID; the first term seen for an ID represents it.
    annotations <- annotations[!duplicated(annotations[, 2]), , drop = FALSE]
    return(annotations)
  }

  annotations[, 1] <- tolower(as.character(annotations[, 1]))

  if (is.null(number)) {
    return(annotations[!duplicated(annotations[, 1]), , drop = FALSE])
  }

  # Frequency table of the lower-cased terms, most frequent first.
  term_counts <- as.data.frame(table(annotations[, 1]))
  colnames(term_counts)[1] <- "term"
  term_counts <- term_counts[order(term_counts$Freq, decreasing = TRUE), ]
  term_counts <- term_counts[top_n(nrow(term_counts)), ]
  term_counts$term <- as.character(term_counts$term)
  term_counts
}
175 | |
176 | |
# For all PMIDs of a row get the PubTator terms and flag them in word_matrix.
# seq_len() makes the loop a no-op for an input table without rows.
for (i in seq_len(nrow(data))) {

  pmids <- as.character(data[i, pmid_cols_index])
  # Drop missing PMIDs: real NA values, the string "NA" and empty cells.
  pmids <- pmids[!is.na(pmids) & pmids != "NA" & nzchar(pmids)]

  # Pause after every ~10000 requested PMIDs.
  if (pmids_count > 10000) {
    cat("Break (10s) to avoid killing of requests. Please wait.", '\n')
    Sys.sleep(10)
    pmids_count <- 0
  }

  pmids_count <- pmids_count + length(pmids)

  # Get the PubTator terms with the get_pubtator_terms function.
  row_terms <- NULL
  if (length(pmids) > 0) {
    row_terms <- get_pubtator_terms(pmids, args$categories)
  }

  if (is.null(row_terms) || nrow(row_terms) == 0) {
    cat("No terms for PMIDs of row", i, " were found.", '\n')
    next
  }

  colnames(row_terms) <- c("term", "mesh.id")

  # Add the data to the binary matrix; assigning to a not yet existing row or
  # column name extends word_matrix.
  if (args$byid) {
    mesh.ids <- as.character(row_terms$mesh.id)
    word_matrix[i, mesh.ids] <- 1
    cat(length(mesh.ids), " IDs for PMIDs of row", i, " were added", '\n')
    # Remember which term belongs to which ID (first term seen per ID wins).
    dict.table <- rbind(dict.table, row_terms)
    dict.table <- dict.table[!duplicated(as.character(dict.table[, 2])), ]
  } else {
    terms <- as.character(row_terms[, 1])
    word_matrix[i, terms] <- 1
    cat(length(terms), " terms for PMIDs of row", i, " were added.", '\n')
  }
}

# Rows are only created when a term is assigned to them, so trailing input
# rows without any term would be missing from the matrix. Pad with NA rows so
# the matrix has one row per input row.
if (ncol(word_matrix) > 0 && nrow(word_matrix) < nrow(data)) {
  word_matrix[nrow(data), 1] <- NA
}
223 | |
# In --byid mode the columns are still named by gene/MeSH ID: exchange each ID
# with the term recorded for it in dict.table. Skipped when no term was
# collected, because dict.table then has no columns to look up.
if (args$byid && ncol(word_matrix) > 0) {
  index_names <- match(names(word_matrix), as.character(dict.table[[2]]))
  names(word_matrix) <- dict.table[index_names, 1]
}

# Remove non-printable characters and double quotes from the column names.
colnames(word_matrix) <- gsub("[^[:print:]]", "", colnames(word_matrix))
colnames(word_matrix) <- gsub('\"', "", colnames(word_matrix), fixed = TRUE)

# Replace missing cells by 0 BEFORE merging: summing 1 + NA would give NA and
# silently turn a present term into 0.
word_matrix <- as.matrix(word_matrix)
word_matrix[is.na(word_matrix)] <- 0

# Merge columns that ended up with the same name (columns come out sorted by
# name) and clamp the sums to 1 so that the matrix stays binary.
if (ncol(word_matrix) > 0) {
  # Columns without a name cannot be grouped; drop them.
  word_matrix <- word_matrix[, !is.na(colnames(word_matrix)), drop = FALSE]
}
if (ncol(word_matrix) > 0) {
  word_matrix <- t(rowsum(t(word_matrix), group = colnames(word_matrix)))
  word_matrix[word_matrix > 1] <- 1
}

# Save the binary matrix as a tab-separated table without row names.
cat("Matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), " columns generated.", "\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = '\t')
242 | |
243 | |
244 | |
245 | |
246 | |
247 |