Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/utils.R @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d1b9e1b2e2f |
---|---|
1 #!/usr/bin/env Rscript | |
2 suppressPackageStartupMessages(library(DBI)) | |
3 suppressPackageStartupMessages(library(RSQLite)) | |
4 | |
5 CONNECTED = FALSE | |
6 if (FALSE) { | |
7 ## for testing | |
8 seqdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/sequences.db" | |
9 hitsortdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/hitsort.db" | |
10 class_file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree.rds" | |
11 ## connect to sqlite databases | |
12 SEQDB = dbConnect(RSQLite::SQLite(), seqdb) | |
13 HITSORTDB = dbConnect(RSQLite::SQLite(), hitsortdb) | |
14 CLS_TREE = readRDS(class_file) | |
15 } | |
16 | |
17 connect_to_databases = function(seqdb, hitsortdb,classification_hierarchy_file = NULL){ | |
18 if (!CONNECTED){ | |
19 SEQDB <<- dbConnect(RSQLite::SQLite(), seqdb) | |
20 HITSORTDB <<- dbConnect(RSQLite::SQLite(), hitsortdb) | |
21 if (!is.null(classification_hierarchy_file)){ | |
22 CLS_TREE <<- readRDS(classification_hierarchy_file) | |
23 } | |
24 CONNECTED <<- TRUE | |
25 } | |
26 } | |
27 | |
28 disconnect_database = function(){ | |
29 if (CONNECTED){ | |
30 dbDisconnect(SEQDB) | |
31 dbDisconnect(HITSORTDB) | |
32 CONNECTED <<- FALSE | |
33 } | |
34 } | |
35 | |
36 nested2named_list = function(x){ | |
37 y = as.list(unlist(x[[1]])) | |
38 names(y) = unlist(x[[2]]) | |
39 return(y) | |
40 } | |
41 | |
42 is_comparative = function(){ | |
43 prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes") | |
44 if (nrow(prefix_codes) == 0){ | |
45 return(FALSE) | |
46 }else{ | |
47 return(TRUE) | |
48 } | |
49 } | |
50 | |
51 get_comparative_codes = function(){ | |
52 prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes") | |
53 return(prefix_codes) | |
54 } | |
55 | |
56 add_preamble = function(html_file, preamble){ | |
57 html_content=readLines(html_file) | |
58 modified_html_content = gsub("<body>", | |
59 paste("<body>\n", preamble,"\n"), | |
60 html_content) | |
61 cat(modified_html_content, file = html_file, sep="\n") | |
62 } | |
63 | |
64 | |
65 df2html = function(df, header = NULL, sort_col = NULL, digits = 3, rounding_function=signif, decreasing = TRUE, scroling = FALSE, width = 300){ | |
66 if (!is.null(sort_col)){ | |
67 df = df[order(df[,sort_col], decreasing = decreasing),] | |
68 } | |
69 if (!is.null(digits)){ | |
70 for (i in seq_along(df)){ | |
71 if(is.numeric(df[,i])){ | |
72 df[,i] = rounding_function(df[,i], digits) | |
73 } | |
74 } | |
75 } | |
76 if (is.null(header)){ | |
77 h = "" | |
78 }else{ | |
79 h = paste0(" <th>",header,"</th>\n", collapse="") %>% | |
80 paste0(" <tr>\n", .," </tr>\n") | |
81 } | |
82 x = apply(df,1,function(x)paste0(" <td>",x,"</td>\n", collapse="")) %>% | |
83 paste0(" <tr>\n", .," </tr>\n", collapse = "") | |
84 if (scroling){ | |
85 cols = paste0('<col width="',rep(round(100/ncol(df)),ncol(df)),'%">\n',collapse ="") | |
86 height = min(200, 22 * nrow(df)) | |
87 out = paste0( | |
88 '<table cellspacing="0" cellpadding="0" border="0" width="',width,'">\n', | |
89 ' <tr>\n', | |
90 ' <td>\n', | |
91 ' <table cellspacing="0" cellpadding="1" border="1" width="', width,'" >\n', | |
92 cols, | |
93 h, | |
94 ' </table>\n', | |
95 ' </td>\n', | |
96 ' </tr>\n', | |
97 ' <tr>\n', | |
98 ' <td>\n', | |
99 ' <div style="width:',width,'px; height:',height,'px; overflow:auto;">\n', | |
100 ' <table cellspacing="0" cellpadding="1" border="1" width="',width,'" >\n', | |
101 cols, | |
102 x, | |
103 ' </table>\n', | |
104 ' </div>\n', | |
105 ' </td>\n', | |
106 ' </tr>\n', | |
107 '</table>\n' | |
108 ) | |
109 | |
110 }else{ | |
111 out = paste ("<table>\n", h,x, "</table>\n") | |
112 } | |
113 return(out) | |
114 } | |
115 | |
116 start_html = function(filename, header){ | |
117 cat(header, file = filename) | |
118 html_writer = function(content, fn=HTML, ...){ | |
119 fn(content, append = TRUE, file = filename, ...) | |
120 } | |
121 } | |
122 | |
123 preformatted = function(x){ | |
124 ## make preformatted html text | |
125 return( | |
126 paste( | |
127 "<pre>\n", | |
128 x, | |
129 "</pre>" | |
130 ,sep="") | |
131 ) | |
132 } | |
133 | |
134 | |
135 summary_histogram = function(fn, N_clustering, N_omit=0, size_adjusted=NULL, top_clusters_prop){ | |
136 ## assume connection do databases | |
137 communities = dbGetQuery( | |
138 HITSORTDB, | |
139 "SELECT DISTINCT cluster, size, supercluster, supercluster_size FROM communities ORDER BY supercluster" | |
140 ) | |
141 if (N_omit != 0){ | |
142 ## adjust communities and cluster sizes: | |
143 cluster_to_adjust = which( | |
144 communities$size[order(communities$cluster)][1:length(size_adjusted)] != size_adjusted | |
145 ) | |
146 ## keep original value: | |
147 communities$size_original = communities$size | |
148 superclusters_to_adjust = unique(communities$supercluster[communities$cluster %in% cluster_to_adjust]) | |
149 for (cl in cluster_to_adjust){ | |
150 communities[communities$cluster == cl,'size'] = size_adjusted[cl] | |
151 } | |
152 for (cl in superclusters_to_adjust){ | |
153 communities[communities$supercluster == cl,'supercluster_size'] = | |
154 sum(communities[communities$supercluster == cl,'size']) | |
155 } | |
156 }else{ | |
157 cluster_to_adjust=NULL | |
158 } | |
159 singlets = N_clustering - sum(communities$size) | |
160 | |
161 supercluster_size = sort(unique(communities[, c('supercluster', 'supercluster_size')])$supercluster_size, decreasing = TRUE) | |
162 | |
163 | |
164 clid2size = sort(communities$size, decreasing = TRUE) | |
165 | |
166 cluster_id = split(communities$cluster, communities$supercluster) | |
167 cluster_id_sort = lapply(cluster_id, function(x)x[order(clid2size[x], decreasing = FALSE)]) | |
168 | |
169 cluster_size_unsorted = split(communities$size, communities$supercluster) | |
170 cluster_size_sort = lapply(cluster_size_unsorted, function(x) (sort(x))) | |
171 ## reorder by size of superclusters | |
172 cluster_size_sort_sort = cluster_size_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)] | |
173 cluster_id_sort_sort = cluster_id_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)] | |
174 | |
175 | |
176 Nmax = max(sapply(cluster_size_sort_sort, length)) | |
177 M = cbind( | |
178 sapply(cluster_size_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))), | |
179 c(1, rep(0, Nmax)) | |
180 ) | |
181 | |
182 Mid = cbind( | |
183 sapply(cluster_id_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))), | |
184 c(1, rep(0, Nmax)) | |
185 ) | |
186 | |
187 recolor = matrix(ifelse(Mid %in% cluster_to_adjust,TRUE,FALSE), ncol=ncol(Mid)) | |
188 indices = which(recolor, arr.ind = TRUE) | |
189 | |
190 | |
191 png(fn, width = 1200, height = 700, pointsize = 20) | |
192 barplot(M, | |
193 width = c(supercluster_size, singlets), | |
194 space = 0, ylim = c(0, max(supercluster_size) * 1.2), | |
195 col = c("#CCCCCC"), names.arg = rep("", ncol(M))) | |
196 plot(0, | |
197 xlim = c(0, sum(c(supercluster_size, singlets))), | |
198 ylim = c(0, max(supercluster_size) * 1.2), | |
199 type = "n", yaxs = 'i', axes = FALSE, xlab = "", ylab = "") | |
200 | |
201 rect(0, 0, | |
202 sum(supercluster_size), | |
203 max(supercluster_size) * 1.2, | |
204 col = "#0000FF10") | |
205 | |
206 rect(sum(supercluster_size), 0, | |
207 sum(supercluster_size) + singlets, | |
208 max(supercluster_size) * 1.2, | |
209 col = "#FFAAFF10") | |
210 | |
211 barplot(M, | |
212 width = c(supercluster_size, singlets), | |
213 space = 0, ylim = c(0, max(supercluster_size) * 1.2), | |
214 col = "#AAAAAA", names.arg = rep("", ncol(M)), add = TRUE, | |
215 xlab = "Proportion of reads [%]", ylab = "Number of reads", | |
216 main = paste(N_clustering, "reads total")) | |
217 | |
218 | |
219 for (i in seq_along(indices[,1])){ | |
220 y1 = sum(M[1:indices[i,'row'],indices[i,'col']]) | |
221 x1 = sum(M[,1:indices[i,'col']]) | |
222 if(indices[i,'row'] == 1){ | |
223 y0=0 | |
224 }else{ | |
225 y0 = sum(M[1:(indices[i,'row']-1),indices[i,'col']]) | |
226 } | |
227 if (indices[i,'col']==1){ | |
228 x0=0 | |
229 }else{ | |
230 x0 = sum(M[,1:(indices[i,'col']-1)]) | |
231 } | |
232 rect(x0,y0,x1,y1, col="#88FF88") | |
233 } | |
234 abline(v=top_clusters_prop * sum(c(supercluster_size, singlets)), col="#00000088", lwd=3, lty=3) | |
235 | |
236 text(sum(supercluster_size) / 2, | |
237 max(supercluster_size) * 1.05, | |
238 labels = paste0(sum(supercluster_size), " reads in\n", | |
239 length(supercluster_size), " supeclusters (", nrow(communities), " clusters)") | |
240 ) | |
241 | |
242 | |
243 text(sum(supercluster_size) + singlets / 2, | |
244 max(supercluster_size) * 1.05, | |
245 labels = paste(singlets, "singlets")) | |
246 | |
247 axis(1,at=seq(0,N_clustering,length.out=11),label=seq(0,100,by=10)) | |
248 dev.off() | |
249 clustering_info = list( | |
250 Number_of_reads_in_clusters = sum(supercluster_size), | |
251 Number_of_clusters = nrow(communities), | |
252 Number_of_superclusters = length(supercluster_size), | |
253 Number_of_singlets = singlets | |
254 ) | |
255 return(clustering_info) | |
256 } | |
257 | |
258 | |
259 rectMap=function(x,scale.by='row',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),cexx=NULL,cexy=NULL){ | |
260 if (scale.by=='row'){ | |
261 #x=(x)/rowSums(x) | |
262 x=(x)/apply(x,1,max) | |
263 } | |
264 if (scale.by=='column'){ | |
265 x=t(t(x)/apply(x,2,max)) | |
266 } | |
267 nc=ncol(x) | |
268 nr=nrow(x) | |
269 coords=expand.grid(1:nr,1:nc) | |
270 plot(coords[,1],coords[,2],type='n',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab) | |
271 axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr)) | |
272 axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7) | |
273 axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7) | |
274 | |
275 mtext(side = 1, "Cluster id", las=1, line = 3, cex = 0.5) | |
276 line = 1.5 + log10(nr) | |
277 mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5) | |
278 s=c(x)/2 # to get it proportional | |
279 w = c(x)/2 | |
280 rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA) | |
281 if (grid){ | |
282 abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030") | |
283 } | |
284 box(col="#60606030",lty=2) | |
285 } | |
286 | |
287 plot_rect_map = function(read_counts,cluster_annotation, output_file,Xcoef=1,Ycoef=1){ | |
288 counts = read.table(read_counts,header=TRUE,as.is=TRUE) | |
289 annot = read.table(cluster_annotation, sep="\t",header=FALSE,as.is=TRUE) | |
290 N = nrow(annot) | |
291 colnames(annot) = c("cluster", "Automatic.classification") | |
292 annot$number.of.reads = rowSums(counts[1 : nrow(annot) ,-1]) | |
293 unique_repeats = names(sort(table(c(annot$Automatic.classification,rep('nd',N))),decreasing = TRUE)) | |
294 | |
295 M = as.matrix(counts[1:N,-(1:2)]) | |
296 rownames(M) = paste0("CL",rownames(M)) | |
297 Mn1=(M)/apply(M,1,max) | |
298 Mn2=M/max(M) | |
299 Mn2=M/apply(M,1,sum) | |
300 | |
301 ord1 = hclust(dist(Mn1),method = "ward.D")$order | |
302 ord2 = hclust(dist(t(Mn2)))$order | |
303 wdth = (400 + N*10 ) * Xcoef | |
304 hgt = (600 + ncol(M)*50) * Ycoef | |
305 ptsize = round((wdth*hgt)^(1/4)) | |
306 png(output_file, width=wdth,height=hgt, pointsize = ptsize) # was 50 | |
307 ploting_area_width = 3 + log10(N)*3 | |
308 ploting_area_sides = 1 | |
309 layout(matrix(c(4,2,3,4,1,3),ncol=3,byrow = TRUE), | |
310 width=c(ploting_area_sides,ploting_area_width,ploting_area_sides), | |
311 height=c(3,ncol(M)*0.5)) | |
312 par(xaxs='i', yaxs = 'i') | |
313 par(las=2,mar=c(4,0,0,0),cex.axis=0.5) | |
314 rectMap(Mn2[ord1,ord2],scale.by='none',col=1, grid=TRUE) | |
315 par(las=2,mar=c(1,0,1,0), mgp = c(2,0.5,0)) | |
316 barplot(annot$number.of.reads[ord1], col = 1) | |
317 mtext(side = 2, "Cluster size", las = 3, line = 2, cex = 0.5) | |
318 par(mar=c(0,0,10,0)) | |
319 plot.new() | |
320 st = dev.off() | |
321 ## calculate coordinated if boxes to create hyperlink | |
322 X0 = wdth/(ploting_area_sides * 2 + ploting_area_width)* ploting_area_sides | |
323 X1 = wdth/(ploting_area_sides * 2 + ploting_area_width)*(ploting_area_sides + ploting_area_width) | |
324 L = round(seq(X0,X1, length.out = N + 1)[1:N]) | |
325 R = round(seq(X0,X1, length.out = N + 1)[2:(N + 1)]) | |
326 cn = rownames(Mn2[ord1,ord2]) | |
327 cluster_links = paste0( | |
328 "seqclust/clustering/clusters/dir_CL", | |
329 sprintf("%04d", as.integer(substring(cn,3 ))), | |
330 "/index.html") | |
331 coords = paste0(L, ",", 1, ",", R, ",", hgt) | |
332 clustermap = paste0( | |
333 '\n<map name="clustermap"> \n', | |
334 paste0( | |
335 '<area shape="rect"\n coords="',coords, '"\n', | |
336 ' href="', cluster_links, '"\n', | |
337 ' title="', cn, '"/>\n', | |
338 collapse = ""), | |
339 "</map>\n") | |
340 return(clustermap) | |
341 } |