Mercurial > repos > alvarofaure > bitlab
comparison chromeister/bin/compute_score.R @ 0:7fdf47a0bae8 draft
Uploaded
author | alvarofaure |
---|---|
date | Wed, 12 Dec 2018 07:18:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7fdf47a0bae8 |
---|---|
1 | |
2 #!/usr/bin/env Rscript | |
3 args = commandArgs(trailingOnly=TRUE) | |
4 | |
5 if(length(args) < 2){ | |
6 stop("USE: Rscript --vanilla plot.R <matrix> <matsize>") | |
7 } | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 growing_regions <- function(mat, reward = 6, penalty = 5, sidePenalty = 3, MAXHSPS = 500, TH = 10, WSIZE = 7){ | |
15 | |
16 #write(mat, file ="data.txt", ncolumns = 200) | |
17 | |
18 len <- min(length(mat[1,]), length(mat[,1])) | |
19 HSPS <- matrix(0, nrow=MAXHSPS, ncol=5) | |
20 idx <- 1 | |
21 lH <- round(WSIZE/2) - 1 | |
22 rH <- round(WSIZE/2) + 1 | |
23 | |
24 | |
25 #print(paste("lH: ", lH, "rH: ", rH)) | |
26 | |
27 if(WSIZE %% 2 == 0){ | |
28 print("WSIZE MUST BE ODD") | |
29 stop() | |
30 } | |
31 | |
32 | |
33 | |
34 i <- 1 | |
35 #readline(prompt="Press [enter] to continue") | |
36 while(i < len){ | |
37 | |
38 value <- max(mat[i,]) * reward | |
39 if(value == 0) i <- i + 1 | |
40 pos <- which.max(mat[i,]) | |
41 # these two hold ending frag | |
42 endfrag <- pos | |
43 j <- i | |
44 count_penalties <- 1 | |
45 | |
46 #print("-----------------") | |
47 | |
48 while(value > 0 && j < len){ | |
49 | |
50 #print(paste("took values ", j, endfrag, "which have score of ", mat[j, endfrag], "current value is: ", value)) | |
51 | |
52 | |
53 | |
54 | |
55 # Reset position used | |
56 | |
57 mat[max(1,j-1), max(1,endfrag-2)] <- 0 | |
58 mat[max(1,j-1), max(1,endfrag-1)] <- 0 | |
59 mat[max(1,j-1), endfrag] <- 0 | |
60 mat[max(1,j-1), min(len, endfrag+1)] <- 0 | |
61 mat[max(1,j-1), min(len, endfrag+2)] <- 0 | |
62 | |
63 mat[j, max(1,endfrag-2)] <- 0 | |
64 mat[j, max(1,endfrag-1)] <- 0 | |
65 mat[j, endfrag] <- 0 | |
66 mat[j, min(len, endfrag+1)] <- 0 | |
67 mat[j, min(len, endfrag+2)] <- 0 | |
68 | |
69 #print(paste("Erasing: (",max(1,j-1), max(1,endfrag-2),")(",max(1,j-1), max(1,endfrag-1),")(",max(1,j-1), endfrag, | |
70 # ")(",max(1,j-1), min(len, endfrag+1),")(",max(1,j-1), min(len, endfrag+2),")(",j, max(1,endfrag-2), | |
71 # ")(",j, max(1,endfrag-1),")(",j, endfrag,")(",j, min(len, endfrag+1),")(",j, min(len, endfrag+2),")")) | |
72 | |
73 | |
74 # Go for next | |
75 j <- j + 1 | |
76 # Check next, could be reverse or forward | |
77 mleft <- max(1, endfrag-lH) | |
78 mright <- min(len, endfrag+lH) | |
79 window <- mat[j, mleft:mright] | |
80 | |
81 | |
82 #print(paste("mleft: ", mleft, "mright", mright, "j is: ", j)) | |
83 #print(window) | |
84 | |
85 v <- max(window) | |
86 selected <- which.max(window) | |
87 # Make it rather go diagonally | |
88 #print(paste("WIndow len: ", length(window))) | |
89 chose_diagonal <- FALSE | |
90 if(length(window) == WSIZE && v == window[lH]){ | |
91 selected <- lH | |
92 chose_diagonal <- TRUE | |
93 } | |
94 | |
95 if(length(window) == WSIZE && v == window[rH]){ | |
96 selected <- rH | |
97 chose_diagonal <- TRUE | |
98 } | |
99 | |
100 #print(paste("Selected value be like ", selected)) | |
101 | |
102 | |
103 if(v != 0){ | |
104 | |
105 | |
106 | |
107 endfrag <- (mleft + selected ) # To make the indexing | |
108 if(length(window) == WSIZE) endfrag <- endfrag - 1 | |
109 endfrag <- max(1, min(len, endfrag)) | |
110 #print(paste("\t", " endfragnew = ", endfrag, "max of window: ", max(window), "on position", which.max(window))) | |
111 } | |
112 | |
113 #print(paste("Chose diagonal is ", chose_diagonal)) | |
114 | |
115 # If no similarity is found | |
116 if(v == 0){ | |
117 value <- value - count_penalties * penalty | |
118 count_penalties <- count_penalties + 1 | |
119 #print("Got penalty #########") | |
120 | |
121 }else{ | |
122 | |
123 # Similarity is found | |
124 | |
125 if(!chose_diagonal){ | |
126 value <- value + count_penalties * (-sidePenalty) | |
127 count_penalties <- count_penalties + 1 | |
128 #print("Got SIDE penalty @@@ #########") | |
129 }else{ | |
130 count_penalties <- 1 | |
131 value <- value + reward | |
132 #print("Got reward thou #########") | |
133 } | |
134 | |
135 } | |
136 #readline(prompt="Press [enter] to continue") | |
137 | |
138 } | |
139 # Check len of frag | |
140 if(j-i > TH){ | |
141 # HSPS[idx, 1] <- i | |
142 # HSPS[idx, 2] <- pos | |
143 # HSPS[idx, 3] <- j | |
144 # HSPS[idx, 4] <- endfrag | |
145 # HSPS[idx, 5] <- abs(i-j) | |
146 | |
147 HSPS[idx, 1] <- pos | |
148 HSPS[idx, 2] <- i | |
149 HSPS[idx, 3] <- endfrag | |
150 HSPS[idx, 4] <- j | |
151 HSPS[idx, 5] <- abs(i-j) | |
152 | |
153 | |
154 | |
155 #print(paste("ACCEPT", i, pos, j, endfrag, "x-y", j-i, abs(endfrag-pos), sep = " ")) | |
156 idx <- idx + 1 | |
157 }else{ | |
158 #print("REJECT") | |
159 } | |
160 # This will prevent overlappign lines, I think | |
161 #i <- j | |
162 | |
163 if(idx == MAXHSPS) break | |
164 } | |
165 | |
166 | |
167 | |
168 | |
169 return (HSPS) | |
170 } | |
171 | |
172 detect_events <- function(HSPS, sampling){ | |
173 | |
174 DIAG_SEPARATION <- 10 | |
175 # same as HSPS but adding the event | |
176 output <- matrix(0, nrow=length(HSPS[,1]), ncol=1+length(HSPS[1,])) | |
177 colnames(output) <- c("x1", "y1", "x2", "y2", "len", "event") | |
178 j <- 0 | |
179 for(i in 1:(length(HSPS[,1]))){ | |
180 | |
181 if(sum(HSPS[i,]) > 0){ | |
182 j <- j + 1 | |
183 is_inverted = FALSE | |
184 is_diagonal = TRUE | |
185 | |
186 if(HSPS[i,1] > HSPS[i,3]) is_inverted = TRUE | |
187 if(abs(HSPS[i,1] - HSPS[i,2]) > DIAG_SEPARATION && abs(HSPS[i,3] - HSPS[i,4]) > DIAG_SEPARATION) is_diagonal = FALSE | |
188 | |
189 output[i,1] <- HSPS[i,1] * sampling | |
190 output[i,2] <- HSPS[i,2] * sampling | |
191 output[i,3] <- HSPS[i,3] * sampling | |
192 output[i,4] <- HSPS[i,4] * sampling | |
193 output[i,5] <- HSPS[i,5] * sampling | |
194 | |
195 if(is_diagonal) output[i,6] <- "synteny block" | |
196 if(is_diagonal && is_inverted) output[i,6] <- "inversion" | |
197 if(!is_diagonal && !is_inverted) output[i,6] <- "transposition" | |
198 if(!is_diagonal && is_inverted) output[i,6] <-"inverted transposition" | |
199 } | |
200 | |
201 | |
202 } | |
203 | |
204 return (output[1:j,]) | |
205 } | |
206 | |
207 | |
208 | |
209 paint_frags <- function(HSPS, l, sampling){ | |
210 | |
211 plot(c(HSPS[1,1]*sampling, HSPS[1,3]*sampling), c(HSPS[1,2]*sampling, HSPS[1,4]*sampling), xlim = c(1,l*sampling), ylim = c(1,l*sampling), type="l", xlab="X-genome",ylab="Y-genome") | |
212 for(i in 2:length(HSPS[,1])){ | |
213 if(sum(HSPS[i,]) > 0){ | |
214 lines(c(HSPS[i,1]*sampling, HSPS[i,3]*sampling), c(HSPS[i,2]*sampling, HSPS[i,4]*sampling)) | |
215 } | |
216 } | |
217 } | |
218 | |
219 supersample <- function(mat, upscale){ | |
220 l <- min(length(mat[1,]), length(mat[,1])) | |
221 size <- round(l*upscale) | |
222 m <- matrix(0, nrow=size, ncol=size) | |
223 | |
224 for(i in 1:size){ | |
225 for(j in 1:size){ | |
226 mleft <- max(1, i-1) | |
227 mright <- min(l, i+1) | |
228 mup <- max(1, j-1) | |
229 mdown <- min(l, j+1) | |
230 | |
231 ri <- max(1, i/upscale) | |
232 rj <- max(1, j/upscale) | |
233 | |
234 if(mat[ri, rj] > 0){ | |
235 m[i, j] <- 1 | |
236 } | |
237 } | |
238 } | |
239 return (m) | |
240 } | |
241 | |
242 | |
243 downsample <- function(mat, downscale){ | |
244 l <- min(length(mat[1,]), length(mat[,1])) | |
245 size <- round(l/downscale) | |
246 m <- matrix(0, nrow=size, ncol=size) | |
247 | |
248 for(i in 1:l){ | |
249 for(j in 1:l){ | |
250 mup <- max(1, i-1) | |
251 mdown <- min(l, i+1) | |
252 mleft <- max(1, j-1) | |
253 mright <- min(l, j+1) | |
254 | |
255 #print(paste("Matrix at ", i, j)) | |
256 #print(mat[mup:mdown, mleft:mright]) | |
257 | |
258 if(sum(mat[mup:mdown, mleft:mright]) > 0){ | |
259 #print(paste("goes to ", round(i/downscale), round(j/downscale))) | |
260 m[round(i/downscale), round(j/downscale)] <- 1 | |
261 } | |
262 } | |
263 } | |
264 | |
265 return (m) | |
266 } | |
267 | |
268 | |
269 | |
270 | |
271 | |
272 | |
273 | |
274 path_mat = args[1] | |
275 | |
276 | |
277 fancy_name <- strsplit(path_mat, "/") | |
278 fancy_name <- (fancy_name[[1]])[length(fancy_name[[1]])] | |
279 | |
280 # Read sequence lenghts | |
281 con <- file(path_mat,"r") | |
282 seq_lengths <- readLines(con, n=2) | |
283 seq_x_len <- as.numeric(seq_lengths[1]) | |
284 seq_y_len <- as.numeric(seq_lengths[2]) | |
285 close(con) | |
286 | |
287 | |
288 data <- as.matrix(read.csv(path_mat, sep = " ", skip=2)) | |
289 | |
290 # Get sequence names | |
291 name_x <- strsplit(fancy_name, "-")[[1]][1] | |
292 name_y <- strsplit(fancy_name, "-")[[1]][2] | |
293 | |
294 # Max of columns | |
295 | |
296 len_i <- as.numeric(args[2]) | |
297 len_j <- as.numeric(args[2]) | |
298 | |
299 | |
300 score_density <- data | |
301 aux_density <- data | |
302 | |
303 | |
304 pmax_pos <- which.max(aux_density[,1]) | |
305 for(i in 1:len_i){ | |
306 | |
307 cmax_pos <- which.max(aux_density[i,]) # get max of row | |
308 | |
309 if((aux_density[i,cmax_pos]) > 0){ # if it has value | |
310 #row_from_col <- which.max(aux_density[,cmax_pos]) # get max of the column pointed by maximum of row | |
311 score_density[i,] <- 0 # put all others in row to 0 i.e. only use this max, EXCEPT for the maximum in the column | |
312 #score_density[row_from_col, cmax_pos] <- 1 | |
313 score_density[i,cmax_pos] <- 1 | |
314 pmax_pos <- cmax_pos | |
315 } | |
316 } | |
317 | |
318 | |
319 pmax_pos <- which.max(aux_density[1,]) | |
320 | |
321 | |
322 for(i in 1:len_i){ | |
323 | |
324 cmax_pos <- which.max(aux_density[,i]) # get max of column | |
325 | |
326 if((aux_density[cmax_pos,i]) > 0){ | |
327 score_density[,i] <- 0 | |
328 score_density[cmax_pos,i] <- 1 | |
329 pmax_pos <- cmax_pos | |
330 } | |
331 } | |
332 | |
333 | |
334 | |
335 | |
336 | |
337 | |
338 | |
339 | |
340 | |
341 score_copy <- score_density | |
342 diag_len <- 4 | |
343 | |
344 for(i in 6:(len_i-5)){ | |
345 for(j in 6:(len_j-5)){ | |
346 | |
347 value <- 0 | |
348 for(w in (-diag_len/2):(diag_len/2)){ | |
349 if(score_density[i+w,j+w] > 0){ | |
350 value <- value + 1 | |
351 } | |
352 } | |
353 | |
354 if(value >= diag_len){ | |
355 for(k in 1:5){ | |
356 score_copy[i+k,j+k] <- 1 | |
357 } | |
358 for(k in 1:5){ | |
359 score_copy[i-k,j-k] <- 1 | |
360 } | |
361 }else if(score_copy[i,j]==0){ | |
362 score_copy[i,j] <- 0 | |
363 } | |
364 } | |
365 } | |
366 | |
367 for(i in 6:(len_i-5)){ | |
368 for(j in 6:(len_j-5)){ | |
369 | |
370 value <- 0 | |
371 for(w in (-diag_len/2):(diag_len/2)){ | |
372 if(score_density[i-w,j+w] > 0){ | |
373 value <- value + 1 | |
374 } | |
375 } | |
376 | |
377 if(value >= diag_len){ | |
378 for(k in 1:5){ | |
379 score_copy[i-k,j+k] <- 1 | |
380 } | |
381 for(k in 1:5){ | |
382 score_copy[i+k,j-k] <- 1 | |
383 } | |
384 }else if(score_copy[i,j]==0){ | |
385 score_copy[i,j] <- 0 | |
386 } | |
387 } | |
388 } | |
389 | |
390 | |
391 # Kernel to remove single points | |
392 | |
393 for(i in 1:(length(score_copy[,1]))){ | |
394 for(j in 1:(length(score_copy[1,]))){ | |
395 | |
396 value <- 0 | |
397 | |
398 min_i <- max(1, i-1) | |
399 max_i <- min(len_i, i+1) | |
400 min_j <- max(1, j-1) | |
401 max_j <- min(len_j, j+1) | |
402 | |
403 value <- sum(score_copy[min_i:max_i, min_j:max_j]) | |
404 | |
405 if(value < 2) score_copy[i,j] <- 0 | |
406 } | |
407 } | |
408 | |
409 # To compute the score | |
410 score <- 0 | |
411 pmax_pos <- which.max(score_copy[,1]) | |
412 dist_th <- 1.5 | |
413 besti <- 1 | |
414 bestj <- pmax_pos | |
415 dvec1 <- abs(which.max(score_copy[,2]) - which.max(score_copy[,1])) | |
416 dvec2 <- abs(which.max(score_copy[,3]) - which.max(score_copy[,2])) | |
417 dvec3 <- abs(which.max(score_copy[,4]) - which.max(score_copy[,3])) | |
418 for(i in 5:len_i){ | |
419 | |
420 #print(paste(paste(paste(dvec1, dvec2), dvec3), mean(c(dvec1, dvec2, dvec3)))) | |
421 distance <- mean(c(dvec1, dvec2, dvec3)) | |
422 | |
423 | |
424 | |
425 dvec1 <- dvec2 | |
426 dvec2 <- dvec3 | |
427 dvec3 <- abs(which.max(score_copy[,i]) - which.max(score_copy[,i-1])) | |
428 | |
429 # If there is a 0 or we are too far away just add max distance! | |
430 if(distance > dist_th || distance == 0){ | |
431 score <- score + len_i | |
432 } | |
433 | |
434 } | |
435 | |
436 | |
437 score <- (score/(len_i^2)) | |
438 print(score) | |
439 | |
440 | |
441 sampling_value <- 5 | |
442 submat <- downsample(score_copy, sampling_value) | |
443 m <- growing_regions((submat), WSIZE = 7, TH = 5, penalty = 15) | |
444 events <- detect_events(m, sampling_value) | |
445 events <- rbind(events, c(0,0,0,0,0,"Null event")) | |
446 write(as.character(c(seq_x_len, seq_y_len)), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=2) | |
447 write(as.character(c("x1", "y1", "x2", "y2", "len", "event")), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6) | |
448 for(i in 1:length(events[,1])){ | |
449 write(as.character(events[i,]), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6) | |
450 } | |
451 | |
452 | |
453 | |
454 coords1 <- round(seq(from=0, to=1, by=0.2)*seq_x_len) | |
455 coords2 <- round(seq(from=0, to=1, by=0.2)*seq_y_len) | |
456 | |
457 final_image <- apply((t(score_copy)), 2, rev) | |
458 | |
459 png(paste(path_mat, ".filt.png", sep=""), width = length(data[,1]), height = length(data[,1])) | |
460 image(t(final_image), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n', main = paste(fancy_name, paste("filt. score=", score)), xlab = name_x, ylab = name_y, axes = FALSE) | |
461 axis(1, tick = TRUE, labels = (coords1), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1)) | |
462 axis(2, tick = TRUE, labels = rev(coords2), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1)) | |
463 dev.off() | |
464 | |
465 | |
466 # Output pixel coordinates of highly conserved signals | |
467 | |
468 # To clear it in case it existed | |
469 write(c(paste("X", "Y")), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = FALSE, sep = "\n") | |
470 | |
471 | |
472 for(i in 1:(length(score_copy[,1]))){ | |
473 for(j in 1:(length(score_copy[1,]))){ | |
474 if(score_copy[i,j] != 0){ | |
475 write(c(paste(i, j)), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = TRUE, sep = "\n") | |
476 } | |
477 } | |
478 } |