annotate clean_rm_output.R @ 3:e955b40ad3a4 draft default tip

Uploaded
author petr-novak
date Tue, 12 Oct 2021 07:43:54 +0000
parents cf3cea0a3039
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
2 suppressPackageStartupMessages(library(rtracklayer))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
3
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
4 gff_cleanup = function(gff){
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
5 ## remove overlapin annotation track - assign new annot
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
6 gff_disjoin = disjoin(gff, with.revmap=TRUE)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
7 ## append annotation:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
11 new_annot_uniq = unique(new_annot)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
12 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
13 names(lca_annot) = new_annot_uniq
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
14 new_annot_lca = lca_annot[new_annot]
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
15 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
16 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
17 gff_disjoin$strands=strand_attribute
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
18 gff_disjoin$source="RM"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
19 gff_disjoin$type="repeat"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
20 gff_disjoin$score=NA
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
21 gff_disjoin$phase=NA
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
22 gff_disjoin$Name=new_annot_lca
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
23 gff_disjoin$Original_names=new_annot
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
24 gff_disjoin$revmap=NULL
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
25 return(gff_disjoin)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
26 }
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
27
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
28 resolve_name=function(x){
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
29 if (length(x)==1){
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
30 # no conflict
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
31 return(x)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
32 } else{
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
33 y = sapply(x, strsplit, split="/", fixed = TRUE)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
34 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x))))
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
35 if (max(ny)<length(x)){
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
36 return("Unknown")
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
37 }else{
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
38 k = which(ny==length(x))
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
39 r = max(as.numeric((gsub(" .+","",names(k)))))
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
40 out = paste(y[[1]][1:r], collapse="/")
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
41 return(out)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
42 }
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
43 }
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
44 }
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
45
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
46
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
47
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
48 infile = commandArgs(T)[1]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
49 outfile = commandArgs(T)[2]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
50
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
51 ## infile = "./test_data/raw_rm.out"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
52
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
53 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
54
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
55 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
56
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
57 # repeat class after # symbol - syntax 1
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
58 gff$Name=rm_out$V11
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
59
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
60 ## is repeat type is specifies by double underscore:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
61 ## then rm_out$V11 is unspecified
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
62 if (any(rm_out$V11 == "Unspecified")){
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
63 ## set Name from prefix
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
64 ## TODO
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
65 inc = rm_out$V11 == "Unspecified"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
66 Name = gsub("__.+","",rm_out$V10)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
67 # chanche Usnpsecified to new name
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
68 gff$Name[inc] = Name
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
69 }
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
70
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
71
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
72 ## join neighbors with the same annotation, disregard strand!
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
73 result <- unlist(reduce(split(gff, gff$Name)))
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
74
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
75 result$Name <- names(result)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
76
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
77 result_clean = gff_cleanup(result)
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
78
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
79 ## TODO
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
80 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
81
3
e955b40ad3a4 Uploaded
petr-novak
parents: 0
diff changeset
82 gff_out = sortSeqlevels(result_clean)
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
83 gff_out = sort(gff_out)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
84 gff_out$type = "repeat_region"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
85 gff_out$source = "RepeatMasker_parsed"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
86 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
87 export(gff_out, format = "gff3", con=outfile)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
88
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
89