Mercurial > repos > petr-novak > repeatrxplorer
comparison databases/create_protein_database_viridiplantae_v3.0.R @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d1b9e1b2e2f |
---|---|
1 #!/usr/bin/env Rscript | |
2 ## prepare protein database in format suitable fro repeatexplorer search | |
3 library(Biostrings) | |
4 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/new_protein_domains_prelim/coded01/ALL_protein-domains_05.fasta") | |
5 # this cannot be used - __ is also in element id!!! | |
6 # element_names = gsub("^.+__","",names(domains)) # | |
7 | |
8 # this shou be version 3 | |
9 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_protein-domains.fasta") | |
10 # this cannot be used - __ is also in element id!!! | |
11 # element_names = gsub("^.+__","",names(domains)) # | |
12 | |
13 element_names = sapply(strsplit(names(domains),split="__"),function(x)paste(x[-1],collapse="__")) | |
14 | |
15 classification = readLines("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_classification") | |
16 ## classification contain slash in categories - must be replaced with underscore | |
17 classification = gsub("/","_",classification) | |
18 | |
19 names(classification) = sapply (strsplit(classification, split="\t"),"[[", 1) | |
20 classification_formated = sapply (sapply(strsplit(classification, "\t"), "[",-1), paste, collapse="/") | |
21 domain_name = gsub("__.+","",names(domains)) | |
22 table(domain_name) | |
23 full_names = paste0(names(domains),"#", classification_formated[element_names],':', domain_name) | |
24 head(full_names) | |
25 names(domains) = full_names | |
26 writeXStringSet(domains,"/mnt/raid/users/petr/workspace/repex_tarean/databases/protein_database_viridiplantae_v3.0.fasta") | |
27 | |
28 | |
29 library(data.tree) | |
30 library(treemap) | |
31 data(GNI2014) | |
32 class(GNI2014) | |
33 head(classification_formated) | |
34 | |
35 ## compile all classification together | |
36 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database.fasta") | |
37 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database_masked.fasta") | |
38 | |
39 add_weight = function(i, name){ | |
40 if (is.null(i$parent[[name]])){ | |
41 i$parent[[name]] = i[[name]] | |
42 }else{ | |
43 i$parent[[name]] = i[[name]] + i$parent[[name]] | |
44 } | |
45 if (i$parent$level == 1){ | |
46 return() | |
47 }else{ | |
48 add_weight(i$parent, name) | |
49 } | |
50 } | |
51 | |
52 | |
53 cls_string = c( | |
54 "All/contamination", | |
55 "All/organelle/plastid", | |
56 "All/organelle/mitochondria", | |
57 "All/repeat/rDNA/45S_rDNA/18S_rDNA", | |
58 "All/repeat/rDNA/45S_rDNA/25S_rDNA", | |
59 "All/repeat/rDNA/45S_rDNA/5.8S_rDNA", | |
60 "All/repeat/rDNA/5S_rDNA", | |
61 "All/repeat/satellite", | |
62 "All/repeat/mobile_element/Class_I/SINE", | |
63 "All/repeat/mobile_element/Class_II/Subclass_1/TIR/MITE" | |
64 ) | |
65 cls_full_name =c( | |
66 "contamination", | |
67 "organelle/plastid", | |
68 "organelle/mitochondria", | |
69 "45S_rDNA/18S_rDNA", | |
70 "45S_rDNA/25S_rDNA", | |
71 "45S_rDNA/5.8S_rDNA", | |
72 "5S_rDNA/5S_rDNA", | |
73 "satellite", | |
74 "Class_I/SINE", | |
75 "Class_II/Subclass_1/TIR/MITE" | |
76 ) | |
77 | |
78 | |
79 | |
80 df1 = data.frame(pathString = cls_string, full_name=cls_full_name, stringsAsFactors = FALSE, nhits = 0,domains=0, prop=0, mean_weight=0, total_weight=0) | |
81 df2 = data.frame(pathString = paste("All/repeat/mobile_element",unique(classification_formated),sep="/"), full_name= unique(classification_formated), stringsAsFactors = FALSE, nhits = 0, domains = 0, prop =0, mean_weight=0, total_weight = 0) | |
82 cls_tree = as.Node (rbind(df1,df2)) | |
83 saveRDS(object = cls_tree, file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree_viridiplantae_v3.0.rds") | |
84 | |
85 print(cls_tree, "nhits", 'domains') | |
86 | |
87 names(cls_tree) | |
88 cls_tree$leaves[[3]]$mean_weight | |
89 | |
90 cls_tree$leaves[[1]]$mean_weight | |
91 | |
92 | |
93 | |
94 ## add to nodes | |
95 add_value_to_nodes = function(tr, name="weight"){ | |
96 for (i in Traverse(tr)){ | |
97 w = sum(sapply(i$leaves,"[[",name)) | |
98 i[[name]] = w | |
99 } | |
100 return(tr) | |
101 } | |
102 | |
103 | |
104 tr2 = add_value_to_nodes(cls_tree, name="nhits") | |
105 print(tr2,"nhits") | |
106 | |
107 ## add_nhits | |
108 for (i in cls_tree$leaves){ | |
109 cls_string = i$full_name | |
110 ## go to the root: | |
111 nhits = i$nhits | |
112 prop = i$prop | |
113 mean_weight = i$mean_weight | |
114 add_weight(i, "nhits") | |
115 } | |
116 | |
117 | |
118 | |
119 |