comparison chromeister/bin/compute_score.R @ 0:7fdf47a0bae8 draft

Uploaded
author alvarofaure
date Wed, 12 Dec 2018 07:18:40 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7fdf47a0bae8
1
2 #!/usr/bin/env Rscript
3 args = commandArgs(trailingOnly=TRUE)
4
5 if(length(args) < 2){
6 stop("USE: Rscript --vanilla plot.R <matrix> <matsize>")
7 }
8
9
10
11
12
13
14 growing_regions <- function(mat, reward = 6, penalty = 5, sidePenalty = 3, MAXHSPS = 500, TH = 10, WSIZE = 7){
15
16 #write(mat, file ="data.txt", ncolumns = 200)
17
18 len <- min(length(mat[1,]), length(mat[,1]))
19 HSPS <- matrix(0, nrow=MAXHSPS, ncol=5)
20 idx <- 1
21 lH <- round(WSIZE/2) - 1
22 rH <- round(WSIZE/2) + 1
23
24
25 #print(paste("lH: ", lH, "rH: ", rH))
26
27 if(WSIZE %% 2 == 0){
28 print("WSIZE MUST BE ODD")
29 stop()
30 }
31
32
33
34 i <- 1
35 #readline(prompt="Press [enter] to continue")
36 while(i < len){
37
38 value <- max(mat[i,]) * reward
39 if(value == 0) i <- i + 1
40 pos <- which.max(mat[i,])
41 # these two hold ending frag
42 endfrag <- pos
43 j <- i
44 count_penalties <- 1
45
46 #print("-----------------")
47
48 while(value > 0 && j < len){
49
50 #print(paste("took values ", j, endfrag, "which have score of ", mat[j, endfrag], "current value is: ", value))
51
52
53
54
55 # Reset position used
56
57 mat[max(1,j-1), max(1,endfrag-2)] <- 0
58 mat[max(1,j-1), max(1,endfrag-1)] <- 0
59 mat[max(1,j-1), endfrag] <- 0
60 mat[max(1,j-1), min(len, endfrag+1)] <- 0
61 mat[max(1,j-1), min(len, endfrag+2)] <- 0
62
63 mat[j, max(1,endfrag-2)] <- 0
64 mat[j, max(1,endfrag-1)] <- 0
65 mat[j, endfrag] <- 0
66 mat[j, min(len, endfrag+1)] <- 0
67 mat[j, min(len, endfrag+2)] <- 0
68
69 #print(paste("Erasing: (",max(1,j-1), max(1,endfrag-2),")(",max(1,j-1), max(1,endfrag-1),")(",max(1,j-1), endfrag,
70 # ")(",max(1,j-1), min(len, endfrag+1),")(",max(1,j-1), min(len, endfrag+2),")(",j, max(1,endfrag-2),
71 # ")(",j, max(1,endfrag-1),")(",j, endfrag,")(",j, min(len, endfrag+1),")(",j, min(len, endfrag+2),")"))
72
73
74 # Go for next
75 j <- j + 1
76 # Check next, could be reverse or forward
77 mleft <- max(1, endfrag-lH)
78 mright <- min(len, endfrag+lH)
79 window <- mat[j, mleft:mright]
80
81
82 #print(paste("mleft: ", mleft, "mright", mright, "j is: ", j))
83 #print(window)
84
85 v <- max(window)
86 selected <- which.max(window)
87 # Make it rather go diagonally
88 #print(paste("WIndow len: ", length(window)))
89 chose_diagonal <- FALSE
90 if(length(window) == WSIZE && v == window[lH]){
91 selected <- lH
92 chose_diagonal <- TRUE
93 }
94
95 if(length(window) == WSIZE && v == window[rH]){
96 selected <- rH
97 chose_diagonal <- TRUE
98 }
99
100 #print(paste("Selected value be like ", selected))
101
102
103 if(v != 0){
104
105
106
107 endfrag <- (mleft + selected ) # To make the indexing
108 if(length(window) == WSIZE) endfrag <- endfrag - 1
109 endfrag <- max(1, min(len, endfrag))
110 #print(paste("\t", " endfragnew = ", endfrag, "max of window: ", max(window), "on position", which.max(window)))
111 }
112
113 #print(paste("Chose diagonal is ", chose_diagonal))
114
115 # If no similarity is found
116 if(v == 0){
117 value <- value - count_penalties * penalty
118 count_penalties <- count_penalties + 1
119 #print("Got penalty #########")
120
121 }else{
122
123 # Similarity is found
124
125 if(!chose_diagonal){
126 value <- value + count_penalties * (-sidePenalty)
127 count_penalties <- count_penalties + 1
128 #print("Got SIDE penalty @@@ #########")
129 }else{
130 count_penalties <- 1
131 value <- value + reward
132 #print("Got reward thou #########")
133 }
134
135 }
136 #readline(prompt="Press [enter] to continue")
137
138 }
139 # Check len of frag
140 if(j-i > TH){
141 # HSPS[idx, 1] <- i
142 # HSPS[idx, 2] <- pos
143 # HSPS[idx, 3] <- j
144 # HSPS[idx, 4] <- endfrag
145 # HSPS[idx, 5] <- abs(i-j)
146
147 HSPS[idx, 1] <- pos
148 HSPS[idx, 2] <- i
149 HSPS[idx, 3] <- endfrag
150 HSPS[idx, 4] <- j
151 HSPS[idx, 5] <- abs(i-j)
152
153
154
155 #print(paste("ACCEPT", i, pos, j, endfrag, "x-y", j-i, abs(endfrag-pos), sep = " "))
156 idx <- idx + 1
157 }else{
158 #print("REJECT")
159 }
160 # This will prevent overlappign lines, I think
161 #i <- j
162
163 if(idx == MAXHSPS) break
164 }
165
166
167
168
169 return (HSPS)
170 }
171
172 detect_events <- function(HSPS, sampling){
173
174 DIAG_SEPARATION <- 10
175 # same as HSPS but adding the event
176 output <- matrix(0, nrow=length(HSPS[,1]), ncol=1+length(HSPS[1,]))
177 colnames(output) <- c("x1", "y1", "x2", "y2", "len", "event")
178 j <- 0
179 for(i in 1:(length(HSPS[,1]))){
180
181 if(sum(HSPS[i,]) > 0){
182 j <- j + 1
183 is_inverted = FALSE
184 is_diagonal = TRUE
185
186 if(HSPS[i,1] > HSPS[i,3]) is_inverted = TRUE
187 if(abs(HSPS[i,1] - HSPS[i,2]) > DIAG_SEPARATION && abs(HSPS[i,3] - HSPS[i,4]) > DIAG_SEPARATION) is_diagonal = FALSE
188
189 output[i,1] <- HSPS[i,1] * sampling
190 output[i,2] <- HSPS[i,2] * sampling
191 output[i,3] <- HSPS[i,3] * sampling
192 output[i,4] <- HSPS[i,4] * sampling
193 output[i,5] <- HSPS[i,5] * sampling
194
195 if(is_diagonal) output[i,6] <- "synteny block"
196 if(is_diagonal && is_inverted) output[i,6] <- "inversion"
197 if(!is_diagonal && !is_inverted) output[i,6] <- "transposition"
198 if(!is_diagonal && is_inverted) output[i,6] <-"inverted transposition"
199 }
200
201
202 }
203
204 return (output[1:j,])
205 }
206
207
208
209 paint_frags <- function(HSPS, l, sampling){
210
211 plot(c(HSPS[1,1]*sampling, HSPS[1,3]*sampling), c(HSPS[1,2]*sampling, HSPS[1,4]*sampling), xlim = c(1,l*sampling), ylim = c(1,l*sampling), type="l", xlab="X-genome",ylab="Y-genome")
212 for(i in 2:length(HSPS[,1])){
213 if(sum(HSPS[i,]) > 0){
214 lines(c(HSPS[i,1]*sampling, HSPS[i,3]*sampling), c(HSPS[i,2]*sampling, HSPS[i,4]*sampling))
215 }
216 }
217 }
218
219 supersample <- function(mat, upscale){
220 l <- min(length(mat[1,]), length(mat[,1]))
221 size <- round(l*upscale)
222 m <- matrix(0, nrow=size, ncol=size)
223
224 for(i in 1:size){
225 for(j in 1:size){
226 mleft <- max(1, i-1)
227 mright <- min(l, i+1)
228 mup <- max(1, j-1)
229 mdown <- min(l, j+1)
230
231 ri <- max(1, i/upscale)
232 rj <- max(1, j/upscale)
233
234 if(mat[ri, rj] > 0){
235 m[i, j] <- 1
236 }
237 }
238 }
239 return (m)
240 }
241
242
243 downsample <- function(mat, downscale){
244 l <- min(length(mat[1,]), length(mat[,1]))
245 size <- round(l/downscale)
246 m <- matrix(0, nrow=size, ncol=size)
247
248 for(i in 1:l){
249 for(j in 1:l){
250 mup <- max(1, i-1)
251 mdown <- min(l, i+1)
252 mleft <- max(1, j-1)
253 mright <- min(l, j+1)
254
255 #print(paste("Matrix at ", i, j))
256 #print(mat[mup:mdown, mleft:mright])
257
258 if(sum(mat[mup:mdown, mleft:mright]) > 0){
259 #print(paste("goes to ", round(i/downscale), round(j/downscale)))
260 m[round(i/downscale), round(j/downscale)] <- 1
261 }
262 }
263 }
264
265 return (m)
266 }
267
268
269
270
271
272
273
274 path_mat = args[1]
275
276
277 fancy_name <- strsplit(path_mat, "/")
278 fancy_name <- (fancy_name[[1]])[length(fancy_name[[1]])]
279
280 # Read sequence lenghts
281 con <- file(path_mat,"r")
282 seq_lengths <- readLines(con, n=2)
283 seq_x_len <- as.numeric(seq_lengths[1])
284 seq_y_len <- as.numeric(seq_lengths[2])
285 close(con)
286
287
288 data <- as.matrix(read.csv(path_mat, sep = " ", skip=2))
289
290 # Get sequence names
291 name_x <- strsplit(fancy_name, "-")[[1]][1]
292 name_y <- strsplit(fancy_name, "-")[[1]][2]
293
294 # Max of columns
295
296 len_i <- as.numeric(args[2])
297 len_j <- as.numeric(args[2])
298
299
300 score_density <- data
301 aux_density <- data
302
303
304 pmax_pos <- which.max(aux_density[,1])
305 for(i in 1:len_i){
306
307 cmax_pos <- which.max(aux_density[i,]) # get max of row
308
309 if((aux_density[i,cmax_pos]) > 0){ # if it has value
310 #row_from_col <- which.max(aux_density[,cmax_pos]) # get max of the column pointed by maximum of row
311 score_density[i,] <- 0 # put all others in row to 0 i.e. only use this max, EXCEPT for the maximum in the column
312 #score_density[row_from_col, cmax_pos] <- 1
313 score_density[i,cmax_pos] <- 1
314 pmax_pos <- cmax_pos
315 }
316 }
317
318
319 pmax_pos <- which.max(aux_density[1,])
320
321
322 for(i in 1:len_i){
323
324 cmax_pos <- which.max(aux_density[,i]) # get max of column
325
326 if((aux_density[cmax_pos,i]) > 0){
327 score_density[,i] <- 0
328 score_density[cmax_pos,i] <- 1
329 pmax_pos <- cmax_pos
330 }
331 }
332
333
334
335
336
337
338
339
340
341 score_copy <- score_density
342 diag_len <- 4
343
344 for(i in 6:(len_i-5)){
345 for(j in 6:(len_j-5)){
346
347 value <- 0
348 for(w in (-diag_len/2):(diag_len/2)){
349 if(score_density[i+w,j+w] > 0){
350 value <- value + 1
351 }
352 }
353
354 if(value >= diag_len){
355 for(k in 1:5){
356 score_copy[i+k,j+k] <- 1
357 }
358 for(k in 1:5){
359 score_copy[i-k,j-k] <- 1
360 }
361 }else if(score_copy[i,j]==0){
362 score_copy[i,j] <- 0
363 }
364 }
365 }
366
367 for(i in 6:(len_i-5)){
368 for(j in 6:(len_j-5)){
369
370 value <- 0
371 for(w in (-diag_len/2):(diag_len/2)){
372 if(score_density[i-w,j+w] > 0){
373 value <- value + 1
374 }
375 }
376
377 if(value >= diag_len){
378 for(k in 1:5){
379 score_copy[i-k,j+k] <- 1
380 }
381 for(k in 1:5){
382 score_copy[i+k,j-k] <- 1
383 }
384 }else if(score_copy[i,j]==0){
385 score_copy[i,j] <- 0
386 }
387 }
388 }
389
390
391 # Kernel to remove single points
392
393 for(i in 1:(length(score_copy[,1]))){
394 for(j in 1:(length(score_copy[1,]))){
395
396 value <- 0
397
398 min_i <- max(1, i-1)
399 max_i <- min(len_i, i+1)
400 min_j <- max(1, j-1)
401 max_j <- min(len_j, j+1)
402
403 value <- sum(score_copy[min_i:max_i, min_j:max_j])
404
405 if(value < 2) score_copy[i,j] <- 0
406 }
407 }
408
409 # To compute the score
410 score <- 0
411 pmax_pos <- which.max(score_copy[,1])
412 dist_th <- 1.5
413 besti <- 1
414 bestj <- pmax_pos
415 dvec1 <- abs(which.max(score_copy[,2]) - which.max(score_copy[,1]))
416 dvec2 <- abs(which.max(score_copy[,3]) - which.max(score_copy[,2]))
417 dvec3 <- abs(which.max(score_copy[,4]) - which.max(score_copy[,3]))
418 for(i in 5:len_i){
419
420 #print(paste(paste(paste(dvec1, dvec2), dvec3), mean(c(dvec1, dvec2, dvec3))))
421 distance <- mean(c(dvec1, dvec2, dvec3))
422
423
424
425 dvec1 <- dvec2
426 dvec2 <- dvec3
427 dvec3 <- abs(which.max(score_copy[,i]) - which.max(score_copy[,i-1]))
428
429 # If there is a 0 or we are too far away just add max distance!
430 if(distance > dist_th || distance == 0){
431 score <- score + len_i
432 }
433
434 }
435
436
437 score <- (score/(len_i^2))
438 print(score)
439
440
441 sampling_value <- 5
442 submat <- downsample(score_copy, sampling_value)
443 m <- growing_regions((submat), WSIZE = 7, TH = 5, penalty = 15)
444 events <- detect_events(m, sampling_value)
445 events <- rbind(events, c(0,0,0,0,0,"Null event"))
446 write(as.character(c(seq_x_len, seq_y_len)), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=2)
447 write(as.character(c("x1", "y1", "x2", "y2", "len", "event")), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6)
448 for(i in 1:length(events[,1])){
449 write(as.character(events[i,]), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6)
450 }
451
452
453
454 coords1 <- round(seq(from=0, to=1, by=0.2)*seq_x_len)
455 coords2 <- round(seq(from=0, to=1, by=0.2)*seq_y_len)
456
457 final_image <- apply((t(score_copy)), 2, rev)
458
459 png(paste(path_mat, ".filt.png", sep=""), width = length(data[,1]), height = length(data[,1]))
460 image(t(final_image), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n', main = paste(fancy_name, paste("filt. score=", score)), xlab = name_x, ylab = name_y, axes = FALSE)
461 axis(1, tick = TRUE, labels = (coords1), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1))
462 axis(2, tick = TRUE, labels = rev(coords2), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1))
463 dev.off()
464
465
466 # Output pixel coordinates of highly conserved signals
467
468 # To clear it in case it existed
469 write(c(paste("X", "Y")), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = FALSE, sep = "\n")
470
471
472 for(i in 1:(length(score_copy[,1]))){
473 for(j in 1:(length(score_copy[1,]))){
474 if(score_copy[i,j] != 0){
475 write(c(paste(i, j)), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = TRUE, sep = "\n")
476 }
477 }
478 }