comparison clean_rm_output.R @ 3:e955b40ad3a4 draft default tip

Uploaded
author petr-novak
date Tue, 12 Oct 2021 07:43:54 +0000
parents cf3cea0a3039
children
comparison
equal deleted inserted replaced
2:3f8ae272f4f3 3:e955b40ad3a4
6 gff_disjoin = disjoin(gff, with.revmap=TRUE) 6 gff_disjoin = disjoin(gff, with.revmap=TRUE)
7 ## append annotation: 7 ## append annotation:
8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) 8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) 9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") 10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
11 new_annot_uniq = unique(new_annot)
12 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
13 names(lca_annot) = new_annot_uniq
14 new_annot_lca = lca_annot[new_annot]
15 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
11 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") 16 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
12 gff_disjoin$strands=strand_attribute 17 gff_disjoin$strands=strand_attribute
13 gff_disjoin$source="RM" 18 gff_disjoin$source="RM"
14 gff_disjoin$type="repeat" 19 gff_disjoin$type="repeat"
15 gff_disjoin$score=NA 20 gff_disjoin$score=NA
16 gff_disjoin$phase=NA 21 gff_disjoin$phase=NA
17 gff_disjoin$Name=new_annot 22 gff_disjoin$Name=new_annot_lca
23 gff_disjoin$Original_names=new_annot
18 gff_disjoin$revmap=NULL 24 gff_disjoin$revmap=NULL
19 return(gff_disjoin) 25 return(gff_disjoin)
20 } 26 }
27
28 resolve_name=function(x){
29 if (length(x)==1){
30 # no conflict
31 return(x)
32 } else{
33 y = sapply(x, strsplit, split="/", fixed = TRUE)
34 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x))))
35 if (max(ny)<length(x)){
36 return("Unknown")
37 }else{
38 k = which(ny==length(x))
39 r = max(as.numeric((gsub(" .+","",names(k)))))
40 out = paste(y[[1]][1:r], collapse="/")
41 return(out)
42 }
43 }
44 }
45
46
47
21 infile = commandArgs(T)[1] 48 infile = commandArgs(T)[1]
22 outfile = commandArgs(T)[2] 49 outfile = commandArgs(T)[2]
23 50
24 ## infile = "./test_data/raw_rm.out" 51 ## infile = "./test_data/raw_rm.out"
25 52
42 } 69 }
43 70
44 71
45 ## join neighbors with the same annotation, disregard strand! 72 ## join neighbors with the same annotation, disregard strand!
46 result <- unlist(reduce(split(gff, gff$Name))) 73 result <- unlist(reduce(split(gff, gff$Name)))
74
47 result$Name <- names(result) 75 result$Name <- names(result)
76
77 result_clean = gff_cleanup(result)
48 78
49 ## TODO 79 ## TODO
50 ## identify conflicting annotation, replace by LCA but keep origin list of classifications 80 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
51 81
52 gff_out = sortSeqlevels(result) 82 gff_out = sortSeqlevels(result_clean)
53 gff_out = sort(gff_out) 83 gff_out = sort(gff_out)
54 gff_out$type = "repeat_region" 84 gff_out$type = "repeat_region"
55 gff_out$source = "RepeatMasker_parsed" 85 gff_out$source = "RepeatMasker_parsed"
56 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) 86 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name))
57 export(gff_out, format = "gff3", con=outfile) 87 export(gff_out, format = "gff3", con=outfile)