annotate databases/create_protein_database_viridiplantae_v3.0.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 ## prepare protein database in format suitable fro repeatexplorer search
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 library(Biostrings)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/new_protein_domains_prelim/coded01/ALL_protein-domains_05.fasta")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 # this cannot be used - __ is also in element id!!!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 # element_names = gsub("^.+__","",names(domains)) #
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 # this shou be version 3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_protein-domains.fasta")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 # this cannot be used - __ is also in element id!!!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 # element_names = gsub("^.+__","",names(domains)) #
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 element_names = sapply(strsplit(names(domains),split="__"),function(x)paste(x[-1],collapse="__"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 classification = readLines("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_classification")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 ## classification contain slash in categories - must be replaced with underscore
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 classification = gsub("/","_",classification)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 names(classification) = sapply (strsplit(classification, split="\t"),"[[", 1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 classification_formated = sapply (sapply(strsplit(classification, "\t"), "[",-1), paste, collapse="/")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21 domain_name = gsub("__.+","",names(domains))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 table(domain_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 full_names = paste0(names(domains),"#", classification_formated[element_names],':', domain_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 head(full_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 names(domains) = full_names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26 writeXStringSet(domains,"/mnt/raid/users/petr/workspace/repex_tarean/databases/protein_database_viridiplantae_v3.0.fasta")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 library(data.tree)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 library(treemap)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 data(GNI2014)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 class(GNI2014)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 head(classification_formated)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 ## compile all classification together
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database.fasta")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database_masked.fasta")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 add_weight = function(i, name){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 if (is.null(i$parent[[name]])){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 i$parent[[name]] = i[[name]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 }else{
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 i$parent[[name]] = i[[name]] + i$parent[[name]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 if (i$parent$level == 1){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 return()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 }else{
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 add_weight(i$parent, name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 cls_string = c(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 "All/contamination",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 "All/organelle/plastid",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 "All/organelle/mitochondria",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 "All/repeat/rDNA/45S_rDNA/18S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 "All/repeat/rDNA/45S_rDNA/25S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 "All/repeat/rDNA/45S_rDNA/5.8S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 "All/repeat/rDNA/5S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 "All/repeat/satellite",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 "All/repeat/mobile_element/Class_I/SINE",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 "All/repeat/mobile_element/Class_II/Subclass_1/TIR/MITE"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 cls_full_name =c(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 "contamination",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 "organelle/plastid",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 "organelle/mitochondria",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 "45S_rDNA/18S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 "45S_rDNA/25S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 "45S_rDNA/5.8S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 "5S_rDNA/5S_rDNA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73 "satellite",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 "Class_I/SINE",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 "Class_II/Subclass_1/TIR/MITE"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 df1 = data.frame(pathString = cls_string, full_name=cls_full_name, stringsAsFactors = FALSE, nhits = 0,domains=0, prop=0, mean_weight=0, total_weight=0)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 df2 = data.frame(pathString = paste("All/repeat/mobile_element",unique(classification_formated),sep="/"), full_name= unique(classification_formated), stringsAsFactors = FALSE, nhits = 0, domains = 0, prop =0, mean_weight=0, total_weight = 0)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 cls_tree = as.Node (rbind(df1,df2))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83 saveRDS(object = cls_tree, file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree_viridiplantae_v3.0.rds")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 print(cls_tree, "nhits", 'domains')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 names(cls_tree)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 cls_tree$leaves[[3]]$mean_weight
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 cls_tree$leaves[[1]]$mean_weight
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 ## add to nodes
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 add_value_to_nodes = function(tr, name="weight"){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 for (i in Traverse(tr)){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 w = sum(sapply(i$leaves,"[[",name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 i[[name]] = w
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 return(tr)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 tr2 = add_value_to_nodes(cls_tree, name="nhits")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 print(tr2,"nhits")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 ## add_nhits
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 for (i in cls_tree$leaves){
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 cls_string = i$full_name
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 ## go to the root:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 nhits = i$nhits
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112 prop = i$prop
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113 mean_weight = i$mean_weight
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 add_weight(i, "nhits")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119