Mercurial > repos > petr-novak > repeat_annotation_pipeline3
comparison clean_rm_output.R @ 2:7f1032da7a0a draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 21 Feb 2022 10:35:13 +0000 |
parents | 814cba36e435 |
children |
comparison
equal
deleted
inserted
replaced
1:814cba36e435 | 2:7f1032da7a0a |
---|---|
14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) | 14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) |
15 names(lca_annot) = new_annot_uniq | 15 names(lca_annot) = new_annot_uniq |
16 new_annot_lca = lca_annot[new_annot] | 16 new_annot_lca = lca_annot[new_annot] |
17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) | 17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) |
18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") | 18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") |
19 gff_disjoin$strands=strand_attribute | |
20 gff_disjoin$source="RM" | 19 gff_disjoin$source="RM" |
21 gff_disjoin$type="repeat" | 20 gff_disjoin$type="repeat" |
22 gff_disjoin$score=NA | 21 gff_disjoin$score=NA |
23 gff_disjoin$phase=NA | 22 gff_disjoin$phase=NA |
24 gff_disjoin$Name=new_annot_lca | 23 gff_disjoin$Name=new_annot_lca |
25 gff_disjoin$Original_names=new_annot | 24 gff_disjoin$Original_names=new_annot |
| | 25 gff_disjoin$strands=strand_attribute |
26 gff_disjoin$revmap=NULL | 26 gff_disjoin$revmap=NULL |
27 return(gff_disjoin) | 27 return(gff_disjoin) |
28 } | 28 } |
29 | 29 |
30 resolve_name=function(x){ | 30 resolve_name=function(x){ |
43 return(out) | 43 return(out) |
44 } | 44 } |
45 } | 45 } |
46 } | 46 } |
47 | 47 |
| | 48 convert_names <- function(n, old_sep = "|" , new_sep = "\""){ |
| | 49 # remove all characters which are new_sep with - |
| | 50 n_new = gsub(old_sep, new_sep, |
| | 51 gsub(new_sep,"-", n, fixed = TRUE), |
| | 52 fixed = TRUE) |
| | 53 return(n_new) |
| | 54 } |
48 | 55 |
49 | 56 |
50 infile = commandArgs(T)[1] | 57 infile = commandArgs(T)[1] |
51 outfile = commandArgs(T)[2] | 58 outfile = commandArgs(T)[2] |
| | 59 |
52 | 60 |
53 ## infile = "./test_data/raw_rm.out" | 61 ## infile = "./test_data/raw_rm.out" |
54 | 62 |
55 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) | 63 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) |
56 | 64 |
57 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) | 65 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) |
58 | 66 |
59 # repeat class after # symbol - syntax 1 | 67 # repeat class after # symbol - syntax 1 |
60 gff$Name=rm_out$V11 | 68 # detect separator |
| | 69 # if "|" is present replace "|" -> "/" and "/" -> "-" |
| | 70 if (any(grepl("|", rm_out$V11, fixed = TRUE))){ |
| | 71 gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/") |
| | 72 message('replacing classification separator character "|" with "/"') |
| | 73 print(gff) |
| | 74 }else{ |
| | 75 gff$Name <- rm_out$V11 |
| | 76 } |
61 | 77 |
62 ## is repeat type is specifies by double underscore: | 78 ## is repeat type is specifies by double underscore: |
63 ## then rm_out$V11 is unspecified | 79 ## then rm_out$V11 is unspecified |
64 if (any(rm_out$V11 == "Unspecified")){ | 80 if (any(rm_out$V11 == "Unspecified")){ |
65 ## set Name from prefix | 81 ## set Name from prefix |