0
|
1
|
|
2 #!/usr/bin/env Rscript
|
|
3 args = commandArgs(trailingOnly=TRUE)
|
|
4
|
|
5 if(length(args) < 2){
|
|
6 stop("USE: Rscript --vanilla plot.R <matrix> <matsize>")
|
|
7 }
|
|
8
|
|
9
|
|
10
|
|
11
|
|
12
|
|
13
|
|
14 growing_regions <- function(mat, reward = 6, penalty = 5, sidePenalty = 3, MAXHSPS = 500, TH = 10, WSIZE = 7){
|
|
15
|
|
16 #write(mat, file ="data.txt", ncolumns = 200)
|
|
17
|
|
18 len <- min(length(mat[1,]), length(mat[,1]))
|
|
19 HSPS <- matrix(0, nrow=MAXHSPS, ncol=5)
|
|
20 idx <- 1
|
|
21 lH <- round(WSIZE/2) - 1
|
|
22 rH <- round(WSIZE/2) + 1
|
|
23
|
|
24
|
|
25 #print(paste("lH: ", lH, "rH: ", rH))
|
|
26
|
|
27 if(WSIZE %% 2 == 0){
|
|
28 print("WSIZE MUST BE ODD")
|
|
29 stop()
|
|
30 }
|
|
31
|
|
32
|
|
33
|
|
34 i <- 1
|
|
35 #readline(prompt="Press [enter] to continue")
|
|
36 while(i < len){
|
|
37
|
|
38 value <- max(mat[i,]) * reward
|
|
39 if(value == 0) i <- i + 1
|
|
40 pos <- which.max(mat[i,])
|
|
41 # these two hold ending frag
|
|
42 endfrag <- pos
|
|
43 j <- i
|
|
44 count_penalties <- 1
|
|
45
|
|
46 #print("-----------------")
|
|
47
|
|
48 while(value > 0 && j < len){
|
|
49
|
|
50 #print(paste("took values ", j, endfrag, "which have score of ", mat[j, endfrag], "current value is: ", value))
|
|
51
|
|
52
|
|
53
|
|
54
|
|
55 # Reset position used
|
|
56
|
|
57 mat[max(1,j-1), max(1,endfrag-2)] <- 0
|
|
58 mat[max(1,j-1), max(1,endfrag-1)] <- 0
|
|
59 mat[max(1,j-1), endfrag] <- 0
|
|
60 mat[max(1,j-1), min(len, endfrag+1)] <- 0
|
|
61 mat[max(1,j-1), min(len, endfrag+2)] <- 0
|
|
62
|
|
63 mat[j, max(1,endfrag-2)] <- 0
|
|
64 mat[j, max(1,endfrag-1)] <- 0
|
|
65 mat[j, endfrag] <- 0
|
|
66 mat[j, min(len, endfrag+1)] <- 0
|
|
67 mat[j, min(len, endfrag+2)] <- 0
|
|
68
|
|
69 #print(paste("Erasing: (",max(1,j-1), max(1,endfrag-2),")(",max(1,j-1), max(1,endfrag-1),")(",max(1,j-1), endfrag,
|
|
70 # ")(",max(1,j-1), min(len, endfrag+1),")(",max(1,j-1), min(len, endfrag+2),")(",j, max(1,endfrag-2),
|
|
71 # ")(",j, max(1,endfrag-1),")(",j, endfrag,")(",j, min(len, endfrag+1),")(",j, min(len, endfrag+2),")"))
|
|
72
|
|
73
|
|
74 # Go for next
|
|
75 j <- j + 1
|
|
76 # Check next, could be reverse or forward
|
|
77 mleft <- max(1, endfrag-lH)
|
|
78 mright <- min(len, endfrag+lH)
|
|
79 window <- mat[j, mleft:mright]
|
|
80
|
|
81
|
|
82 #print(paste("mleft: ", mleft, "mright", mright, "j is: ", j))
|
|
83 #print(window)
|
|
84
|
|
85 v <- max(window)
|
|
86 selected <- which.max(window)
|
|
87 # Make it rather go diagonally
|
|
88 #print(paste("WIndow len: ", length(window)))
|
|
89 chose_diagonal <- FALSE
|
|
90 if(length(window) == WSIZE && v == window[lH]){
|
|
91 selected <- lH
|
|
92 chose_diagonal <- TRUE
|
|
93 }
|
|
94
|
|
95 if(length(window) == WSIZE && v == window[rH]){
|
|
96 selected <- rH
|
|
97 chose_diagonal <- TRUE
|
|
98 }
|
|
99
|
|
100 #print(paste("Selected value be like ", selected))
|
|
101
|
|
102
|
|
103 if(v != 0){
|
|
104
|
|
105
|
|
106
|
|
107 endfrag <- (mleft + selected ) # To make the indexing
|
|
108 if(length(window) == WSIZE) endfrag <- endfrag - 1
|
|
109 endfrag <- max(1, min(len, endfrag))
|
|
110 #print(paste("\t", " endfragnew = ", endfrag, "max of window: ", max(window), "on position", which.max(window)))
|
|
111 }
|
|
112
|
|
113 #print(paste("Chose diagonal is ", chose_diagonal))
|
|
114
|
|
115 # If no similarity is found
|
|
116 if(v == 0){
|
|
117 value <- value - count_penalties * penalty
|
|
118 count_penalties <- count_penalties + 1
|
|
119 #print("Got penalty #########")
|
|
120
|
|
121 }else{
|
|
122
|
|
123 # Similarity is found
|
|
124
|
|
125 if(!chose_diagonal){
|
|
126 value <- value + count_penalties * (-sidePenalty)
|
|
127 count_penalties <- count_penalties + 1
|
|
128 #print("Got SIDE penalty @@@ #########")
|
|
129 }else{
|
|
130 count_penalties <- 1
|
|
131 value <- value + reward
|
|
132 #print("Got reward thou #########")
|
|
133 }
|
|
134
|
|
135 }
|
|
136 #readline(prompt="Press [enter] to continue")
|
|
137
|
|
138 }
|
|
139 # Check len of frag
|
|
140 if(j-i > TH){
|
|
141 # HSPS[idx, 1] <- i
|
|
142 # HSPS[idx, 2] <- pos
|
|
143 # HSPS[idx, 3] <- j
|
|
144 # HSPS[idx, 4] <- endfrag
|
|
145 # HSPS[idx, 5] <- abs(i-j)
|
|
146
|
|
147 HSPS[idx, 1] <- pos
|
|
148 HSPS[idx, 2] <- i
|
|
149 HSPS[idx, 3] <- endfrag
|
|
150 HSPS[idx, 4] <- j
|
|
151 HSPS[idx, 5] <- abs(i-j)
|
|
152
|
|
153
|
|
154
|
|
155 #print(paste("ACCEPT", i, pos, j, endfrag, "x-y", j-i, abs(endfrag-pos), sep = " "))
|
|
156 idx <- idx + 1
|
|
157 }else{
|
|
158 #print("REJECT")
|
|
159 }
|
|
160 # This will prevent overlappign lines, I think
|
|
161 #i <- j
|
|
162
|
|
163 if(idx == MAXHSPS) break
|
|
164 }
|
|
165
|
|
166
|
|
167
|
|
168
|
|
169 return (HSPS)
|
|
170 }
|
|
171
|
|
172 detect_events <- function(HSPS, sampling){
|
|
173
|
|
174 DIAG_SEPARATION <- 10
|
|
175 # same as HSPS but adding the event
|
|
176 output <- matrix(0, nrow=length(HSPS[,1]), ncol=1+length(HSPS[1,]))
|
|
177 colnames(output) <- c("x1", "y1", "x2", "y2", "len", "event")
|
|
178 j <- 0
|
|
179 for(i in 1:(length(HSPS[,1]))){
|
|
180
|
|
181 if(sum(HSPS[i,]) > 0){
|
|
182 j <- j + 1
|
|
183 is_inverted = FALSE
|
|
184 is_diagonal = TRUE
|
|
185
|
|
186 if(HSPS[i,1] > HSPS[i,3]) is_inverted = TRUE
|
|
187 if(abs(HSPS[i,1] - HSPS[i,2]) > DIAG_SEPARATION && abs(HSPS[i,3] - HSPS[i,4]) > DIAG_SEPARATION) is_diagonal = FALSE
|
|
188
|
|
189 output[i,1] <- HSPS[i,1] * sampling
|
|
190 output[i,2] <- HSPS[i,2] * sampling
|
|
191 output[i,3] <- HSPS[i,3] * sampling
|
|
192 output[i,4] <- HSPS[i,4] * sampling
|
|
193 output[i,5] <- HSPS[i,5] * sampling
|
|
194
|
|
195 if(is_diagonal) output[i,6] <- "synteny block"
|
|
196 if(is_diagonal && is_inverted) output[i,6] <- "inversion"
|
|
197 if(!is_diagonal && !is_inverted) output[i,6] <- "transposition"
|
|
198 if(!is_diagonal && is_inverted) output[i,6] <-"inverted transposition"
|
|
199 }
|
|
200
|
|
201
|
|
202 }
|
|
203
|
|
204 return (output[1:j,])
|
|
205 }
|
|
206
|
|
207
|
|
208
|
|
209 paint_frags <- function(HSPS, l, sampling){
|
|
210
|
|
211 plot(c(HSPS[1,1]*sampling, HSPS[1,3]*sampling), c(HSPS[1,2]*sampling, HSPS[1,4]*sampling), xlim = c(1,l*sampling), ylim = c(1,l*sampling), type="l", xlab="X-genome",ylab="Y-genome")
|
|
212 for(i in 2:length(HSPS[,1])){
|
|
213 if(sum(HSPS[i,]) > 0){
|
|
214 lines(c(HSPS[i,1]*sampling, HSPS[i,3]*sampling), c(HSPS[i,2]*sampling, HSPS[i,4]*sampling))
|
|
215 }
|
|
216 }
|
|
217 }
|
|
218
|
|
219 supersample <- function(mat, upscale){
|
|
220 l <- min(length(mat[1,]), length(mat[,1]))
|
|
221 size <- round(l*upscale)
|
|
222 m <- matrix(0, nrow=size, ncol=size)
|
|
223
|
|
224 for(i in 1:size){
|
|
225 for(j in 1:size){
|
|
226 mleft <- max(1, i-1)
|
|
227 mright <- min(l, i+1)
|
|
228 mup <- max(1, j-1)
|
|
229 mdown <- min(l, j+1)
|
|
230
|
|
231 ri <- max(1, i/upscale)
|
|
232 rj <- max(1, j/upscale)
|
|
233
|
|
234 if(mat[ri, rj] > 0){
|
|
235 m[i, j] <- 1
|
|
236 }
|
|
237 }
|
|
238 }
|
|
239 return (m)
|
|
240 }
|
|
241
|
|
242
|
|
243 downsample <- function(mat, downscale){
|
|
244 l <- min(length(mat[1,]), length(mat[,1]))
|
|
245 size <- round(l/downscale)
|
|
246 m <- matrix(0, nrow=size, ncol=size)
|
|
247
|
|
248 for(i in 1:l){
|
|
249 for(j in 1:l){
|
|
250 mup <- max(1, i-1)
|
|
251 mdown <- min(l, i+1)
|
|
252 mleft <- max(1, j-1)
|
|
253 mright <- min(l, j+1)
|
|
254
|
|
255 #print(paste("Matrix at ", i, j))
|
|
256 #print(mat[mup:mdown, mleft:mright])
|
|
257
|
|
258 if(sum(mat[mup:mdown, mleft:mright]) > 0){
|
|
259 #print(paste("goes to ", round(i/downscale), round(j/downscale)))
|
|
260 m[round(i/downscale), round(j/downscale)] <- 1
|
|
261 }
|
|
262 }
|
|
263 }
|
|
264
|
|
265 return (m)
|
|
266 }
|
|
267
|
|
268
|
|
269
|
|
270
|
|
271
|
|
272
|
|
273
|
|
274 path_mat = args[1]
|
|
275
|
|
276
|
|
277 fancy_name <- strsplit(path_mat, "/")
|
|
278 fancy_name <- (fancy_name[[1]])[length(fancy_name[[1]])]
|
|
279
|
|
280 # Read sequence lenghts
|
|
281 con <- file(path_mat,"r")
|
|
282 seq_lengths <- readLines(con, n=2)
|
|
283 seq_x_len <- as.numeric(seq_lengths[1])
|
|
284 seq_y_len <- as.numeric(seq_lengths[2])
|
|
285 close(con)
|
|
286
|
|
287
|
|
288 data <- as.matrix(read.csv(path_mat, sep = " ", skip=2))
|
|
289
|
|
290 # Get sequence names
|
|
291 name_x <- strsplit(fancy_name, "-")[[1]][1]
|
|
292 name_y <- strsplit(fancy_name, "-")[[1]][2]
|
|
293
|
|
294 # Max of columns
|
|
295
|
|
296 len_i <- as.numeric(args[2])
|
|
297 len_j <- as.numeric(args[2])
|
|
298
|
|
299
|
|
300 score_density <- data
|
|
301 aux_density <- data
|
|
302
|
|
303
|
|
304 pmax_pos <- which.max(aux_density[,1])
|
|
305 for(i in 1:len_i){
|
|
306
|
|
307 cmax_pos <- which.max(aux_density[i,]) # get max of row
|
|
308
|
|
309 if((aux_density[i,cmax_pos]) > 0){ # if it has value
|
|
310 #row_from_col <- which.max(aux_density[,cmax_pos]) # get max of the column pointed by maximum of row
|
|
311 score_density[i,] <- 0 # put all others in row to 0 i.e. only use this max, EXCEPT for the maximum in the column
|
|
312 #score_density[row_from_col, cmax_pos] <- 1
|
|
313 score_density[i,cmax_pos] <- 1
|
|
314 pmax_pos <- cmax_pos
|
|
315 }
|
|
316 }
|
|
317
|
|
318
|
|
319 pmax_pos <- which.max(aux_density[1,])
|
|
320
|
|
321
|
|
322 for(i in 1:len_i){
|
|
323
|
|
324 cmax_pos <- which.max(aux_density[,i]) # get max of column
|
|
325
|
|
326 if((aux_density[cmax_pos,i]) > 0){
|
|
327 score_density[,i] <- 0
|
|
328 score_density[cmax_pos,i] <- 1
|
|
329 pmax_pos <- cmax_pos
|
|
330 }
|
|
331 }
|
|
332
|
|
333
|
|
334
|
|
335
|
|
336
|
|
337
|
|
338
|
|
339
|
|
340
|
|
341 score_copy <- score_density
|
|
342 diag_len <- 4
|
|
343
|
|
344 for(i in 6:(len_i-5)){
|
|
345 for(j in 6:(len_j-5)){
|
|
346
|
|
347 value <- 0
|
|
348 for(w in (-diag_len/2):(diag_len/2)){
|
|
349 if(score_density[i+w,j+w] > 0){
|
|
350 value <- value + 1
|
|
351 }
|
|
352 }
|
|
353
|
|
354 if(value >= diag_len){
|
|
355 for(k in 1:5){
|
|
356 score_copy[i+k,j+k] <- 1
|
|
357 }
|
|
358 for(k in 1:5){
|
|
359 score_copy[i-k,j-k] <- 1
|
|
360 }
|
|
361 }else if(score_copy[i,j]==0){
|
|
362 score_copy[i,j] <- 0
|
|
363 }
|
|
364 }
|
|
365 }
|
|
366
|
|
367 for(i in 6:(len_i-5)){
|
|
368 for(j in 6:(len_j-5)){
|
|
369
|
|
370 value <- 0
|
|
371 for(w in (-diag_len/2):(diag_len/2)){
|
|
372 if(score_density[i-w,j+w] > 0){
|
|
373 value <- value + 1
|
|
374 }
|
|
375 }
|
|
376
|
|
377 if(value >= diag_len){
|
|
378 for(k in 1:5){
|
|
379 score_copy[i-k,j+k] <- 1
|
|
380 }
|
|
381 for(k in 1:5){
|
|
382 score_copy[i+k,j-k] <- 1
|
|
383 }
|
|
384 }else if(score_copy[i,j]==0){
|
|
385 score_copy[i,j] <- 0
|
|
386 }
|
|
387 }
|
|
388 }
|
|
389
|
|
390
|
|
391 # Kernel to remove single points
|
|
392
|
|
393 for(i in 1:(length(score_copy[,1]))){
|
|
394 for(j in 1:(length(score_copy[1,]))){
|
|
395
|
|
396 value <- 0
|
|
397
|
|
398 min_i <- max(1, i-1)
|
|
399 max_i <- min(len_i, i+1)
|
|
400 min_j <- max(1, j-1)
|
|
401 max_j <- min(len_j, j+1)
|
|
402
|
|
403 value <- sum(score_copy[min_i:max_i, min_j:max_j])
|
|
404
|
|
405 if(value < 2) score_copy[i,j] <- 0
|
|
406 }
|
|
407 }
|
|
408
|
|
409 # To compute the score
|
|
410 score <- 0
|
|
411 pmax_pos <- which.max(score_copy[,1])
|
|
412 dist_th <- 1.5
|
|
413 besti <- 1
|
|
414 bestj <- pmax_pos
|
|
415 dvec1 <- abs(which.max(score_copy[,2]) - which.max(score_copy[,1]))
|
|
416 dvec2 <- abs(which.max(score_copy[,3]) - which.max(score_copy[,2]))
|
|
417 dvec3 <- abs(which.max(score_copy[,4]) - which.max(score_copy[,3]))
|
|
418 for(i in 5:len_i){
|
|
419
|
|
420 #print(paste(paste(paste(dvec1, dvec2), dvec3), mean(c(dvec1, dvec2, dvec3))))
|
|
421 distance <- mean(c(dvec1, dvec2, dvec3))
|
|
422
|
|
423
|
|
424
|
|
425 dvec1 <- dvec2
|
|
426 dvec2 <- dvec3
|
|
427 dvec3 <- abs(which.max(score_copy[,i]) - which.max(score_copy[,i-1]))
|
|
428
|
|
429 # If there is a 0 or we are too far away just add max distance!
|
|
430 if(distance > dist_th || distance == 0){
|
|
431 score <- score + len_i
|
|
432 }
|
|
433
|
|
434 }
|
|
435
|
|
436
|
|
437 score <- (score/(len_i^2))
|
|
438 print(score)
|
|
439
|
|
440
|
|
441 sampling_value <- 5
|
|
442 submat <- downsample(score_copy, sampling_value)
|
|
443 m <- growing_regions((submat), WSIZE = 7, TH = 5, penalty = 15)
|
|
444 events <- detect_events(m, sampling_value)
|
|
445 events <- rbind(events, c(0,0,0,0,0,"Null event"))
|
|
446 write(as.character(c(seq_x_len, seq_y_len)), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=2)
|
|
447 write(as.character(c("x1", "y1", "x2", "y2", "len", "event")), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6)
|
|
448 for(i in 1:length(events[,1])){
|
|
449 write(as.character(events[i,]), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6)
|
|
450 }
|
|
451
|
|
452
|
|
453
|
|
454 coords1 <- round(seq(from=0, to=1, by=0.2)*seq_x_len)
|
|
455 coords2 <- round(seq(from=0, to=1, by=0.2)*seq_y_len)
|
|
456
|
|
457 final_image <- apply((t(score_copy)), 2, rev)
|
|
458
|
|
459 png(paste(path_mat, ".filt.png", sep=""), width = length(data[,1]), height = length(data[,1]))
|
|
460 image(t(final_image), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n', main = paste(fancy_name, paste("filt. score=", score)), xlab = name_x, ylab = name_y, axes = FALSE)
|
|
461 axis(1, tick = TRUE, labels = (coords1), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1))
|
|
462 axis(2, tick = TRUE, labels = rev(coords2), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1))
|
|
463 dev.off()
|
|
464
|
|
465
|
|
466 # Output pixel coordinates of highly conserved signals
|
|
467
|
|
468 # To clear it in case it existed
|
|
469 write(c(paste("X", "Y")), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = FALSE, sep = "\n")
|
|
470
|
|
471
|
|
472 for(i in 1:(length(score_copy[,1]))){
|
|
473 for(j in 1:(length(score_copy[1,]))){
|
|
474 if(score_copy[i,j] != 0){
|
|
475 write(c(paste(i, j)), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = TRUE, sep = "\n")
|
|
476 }
|
|
477 }
|
|
478 }
|