0
|
1 #!/usr/bin/env Rscript
|
|
2 suppressPackageStartupMessages(library(rtracklayer))
|
1
|
3 suppressPackageStartupMessages(library(parallel))
|
|
4
|
0
|
5
|
|
6 gff_cleanup = function(gff){
|
|
7 ## remove overlapin annotation track - assign new annot
|
|
8 gff_disjoin = disjoin(gff, with.revmap=TRUE)
|
|
9 ## append annotation:
|
|
10 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
|
|
11 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
|
|
12 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
|
|
13 new_annot_uniq = unique(new_annot)
|
|
14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
|
|
15 names(lca_annot) = new_annot_uniq
|
|
16 new_annot_lca = lca_annot[new_annot]
|
|
17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
|
|
18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
|
|
19 gff_disjoin$strands=strand_attribute
|
|
20 gff_disjoin$source="RM"
|
|
21 gff_disjoin$type="repeat"
|
|
22 gff_disjoin$score=NA
|
|
23 gff_disjoin$phase=NA
|
|
24 gff_disjoin$Name=new_annot_lca
|
|
25 gff_disjoin$Original_names=new_annot
|
|
26 gff_disjoin$revmap=NULL
|
|
27 return(gff_disjoin)
|
|
28 }
|
|
29
|
|
30 resolve_name=function(x){
|
|
31 if (length(x)==1){
|
|
32 # no conflict
|
|
33 return(x)
|
|
34 } else{
|
|
35 y = sapply(x, strsplit, split="/", fixed = TRUE)
|
|
36 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x))))
|
|
37 if (max(ny)<length(x)){
|
|
38 return("Unknown")
|
|
39 }else{
|
|
40 k = which(ny==length(x))
|
|
41 r = max(as.numeric((gsub(" .+","",names(k)))))
|
|
42 out = paste(y[[1]][1:r], collapse="/")
|
|
43 return(out)
|
|
44 }
|
|
45 }
|
|
46 }
|
|
47
|
|
48
|
|
49
|
|
50 infile = commandArgs(T)[1]
|
|
51 outfile = commandArgs(T)[2]
|
|
52
|
|
53 ## infile = "./test_data/raw_rm.out"
|
|
54
|
1
|
55 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16))
|
0
|
56
|
|
57 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
|
|
58
|
|
59 # repeat class after # symbol - syntax 1
|
|
60 gff$Name=rm_out$V11
|
|
61
|
|
62 ## is repeat type is specifies by double underscore:
|
|
63 ## then rm_out$V11 is unspecified
|
|
64 if (any(rm_out$V11 == "Unspecified")){
|
|
65 ## set Name from prefix
|
|
66 ## TODO
|
|
67 inc = rm_out$V11 == "Unspecified"
|
|
68 Name = gsub("__.+","",rm_out$V10)
|
|
69 # chanche Usnpsecified to new name
|
|
70 gff$Name[inc] = Name
|
|
71 }
|
|
72
|
|
73
|
|
74 ## join neighbors with the same annotation, disregard strand!
|
|
75 result <- unlist(reduce(split(gff, gff$Name)))
|
|
76
|
|
77 result$Name <- names(result)
|
|
78
|
|
79 result_clean = gff_cleanup(result)
|
|
80
|
|
81 ## TODO
|
|
82 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
|
|
83
|
|
84 gff_out = sortSeqlevels(result_clean)
|
|
85 gff_out = sort(gff_out)
|
|
86 gff_out$type = "repeat_region"
|
|
87 gff_out$source = "RepeatMasker_parsed"
|
|
88 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name))
|
|
89 export(gff_out, format = "gff3", con=outfile)
|
|
90
|
|
91
|