Mercurial > repos > petr-novak > repeatrxplorer
diff databases/create_protein_database_viridiplantae_v3.0.R @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/databases/create_protein_database_viridiplantae_v3.0.R Thu Dec 19 10:24:45 2019 -0500 @@ -0,0 +1,119 @@ +#!/usr/bin/env Rscript +## prepare protein database in format suitable fro repeatexplorer search +library(Biostrings) +domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/new_protein_domains_prelim/coded01/ALL_protein-domains_05.fasta") + # this cannot be used - __ is also in element id!!! + # element_names = gsub("^.+__","",names(domains)) # + +# this shou be version 3 +domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_protein-domains.fasta") + # this cannot be used - __ is also in element id!!! + # element_names = gsub("^.+__","",names(domains)) # + +element_names = sapply(strsplit(names(domains),split="__"),function(x)paste(x[-1],collapse="__")) + +classification = readLines("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_classification") +## classification contain slash in categories - must be replaced with underscore +classification = gsub("/","_",classification) + +names(classification) = sapply (strsplit(classification, split="\t"),"[[", 1) +classification_formated = sapply (sapply(strsplit(classification, "\t"), "[",-1), paste, collapse="/") +domain_name = gsub("__.+","",names(domains)) +table(domain_name) +full_names = paste0(names(domains),"#", classification_formated[element_names],':', domain_name) +head(full_names) +names(domains) = full_names +writeXStringSet(domains,"/mnt/raid/users/petr/workspace/repex_tarean/databases/protein_database_viridiplantae_v3.0.fasta") + + +library(data.tree) +library(treemap) +data(GNI2014) +class(GNI2014) +head(classification_formated) + +## compile all classification together +dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database.fasta") +dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database_masked.fasta") + +add_weight = function(i, name){ + if (is.null(i$parent[[name]])){ + i$parent[[name]] = i[[name]] + }else{ + i$parent[[name]] = i[[name]] + i$parent[[name]] + } + if (i$parent$level == 1){ + return() + }else{ + add_weight(i$parent, name) + } +} + + +cls_string = c( + "All/contamination", + "All/organelle/plastid", + "All/organelle/mitochondria", + "All/repeat/rDNA/45S_rDNA/18S_rDNA", + "All/repeat/rDNA/45S_rDNA/25S_rDNA", + "All/repeat/rDNA/45S_rDNA/5.8S_rDNA", + "All/repeat/rDNA/5S_rDNA", + "All/repeat/satellite", + "All/repeat/mobile_element/Class_I/SINE", + "All/repeat/mobile_element/Class_II/Subclass_1/TIR/MITE" +) +cls_full_name =c( + "contamination", + "organelle/plastid", + "organelle/mitochondria", + "45S_rDNA/18S_rDNA", + "45S_rDNA/25S_rDNA", + "45S_rDNA/5.8S_rDNA", + "5S_rDNA/5S_rDNA", + "satellite", + "Class_I/SINE", + "Class_II/Subclass_1/TIR/MITE" +) + + + +df1 = data.frame(pathString = cls_string, full_name=cls_full_name, stringsAsFactors = FALSE, nhits = 0,domains=0, prop=0, mean_weight=0, total_weight=0) +df2 = data.frame(pathString = paste("All/repeat/mobile_element",unique(classification_formated),sep="/"), full_name= unique(classification_formated), stringsAsFactors = FALSE, nhits = 0, domains = 0, prop =0, mean_weight=0, total_weight = 0) +cls_tree = as.Node (rbind(df1,df2)) +saveRDS(object = cls_tree, file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree_viridiplantae_v3.0.rds") + +print(cls_tree, "nhits", 'domains') + +names(cls_tree) +cls_tree$leaves[[3]]$mean_weight + +cls_tree$leaves[[1]]$mean_weight + + + +## add to nodes +add_value_to_nodes = function(tr, name="weight"){ + for (i in Traverse(tr)){ + w = sum(sapply(i$leaves,"[[",name)) + i[[name]] = w + } + return(tr) +} + + +tr2 = add_value_to_nodes(cls_tree, name="nhits") +print(tr2,"nhits") + +## add_nhits +for (i in cls_tree$leaves){ + cls_string = i$full_name + ## go to the root: + nhits = i$nhits + prop = i$prop + mean_weight = i$mean_weight + add_weight(i, "nhits") +} + + + +