Mercurial > repos > petr-novak > repeat_annotation_pipeline2
diff clean_rm_output.R @ 3:e955b40ad3a4 draft default tip
Uploaded
author | petr-novak |
---|---|
date | Tue, 12 Oct 2021 07:43:54 +0000 (2021-10-12) |
parents | cf3cea0a3039 |
children |
line wrap: on
line diff
--- a/clean_rm_output.R Thu Oct 07 07:29:59 2021 +0000 +++ b/clean_rm_output.R Tue Oct 12 07:43:54 2021 +0000 @@ -8,16 +8,43 @@ gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") + new_annot_uniq = unique(new_annot) + lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) + names(lca_annot) = new_annot_uniq + new_annot_lca = lca_annot[new_annot] + #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") gff_disjoin$strands=strand_attribute gff_disjoin$source="RM" gff_disjoin$type="repeat" gff_disjoin$score=NA gff_disjoin$phase=NA - gff_disjoin$Name=new_annot + gff_disjoin$Name=new_annot_lca + gff_disjoin$Original_names=new_annot gff_disjoin$revmap=NULL return(gff_disjoin) } + +resolve_name=function(x){ + if (length(x)==1){ + # no conflict + return(x) + } else{ + y = sapply(x, strsplit, split="/", fixed = TRUE) + ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x)))) + if (max(ny)<length(x)){ + return("Unknown") + }else{ + k = which(ny==length(x)) + r = max(as.numeric((gsub(" .+","",names(k))))) + out = paste(y[[1]][1:r], collapse="/") + return(out) + } + } +} + + + infile = commandArgs(T)[1] outfile = commandArgs(T)[2] @@ -44,12 +71,15 @@ ## join neighbors with the same annotation, disregard strand! result <- unlist(reduce(split(gff, gff$Name))) + result$Name <- names(result) +result_clean = gff_cleanup(result) + ## TODO ## identify conflicting annotation, replace by LCA but keep origin list of classifications -gff_out = sortSeqlevels(result) +gff_out = sortSeqlevels(result_clean) gff_out = sort(gff_out) gff_out$type = "repeat_region" gff_out$source = "RepeatMasker_parsed"