| 
0
 | 
     1 #!/usr/bin/env Rscript
 | 
| 
 | 
     2 ## prepare protein database in format suitable fro repeatexplorer search
 | 
| 
 | 
     3 library(Biostrings)
 | 
| 
 | 
     4 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/new_protein_domains_prelim/coded01/ALL_protein-domains_05.fasta")
 | 
| 
 | 
     5                                         # this cannot be used - __ is also in element id!!!
 | 
| 
 | 
     6                                         # element_names = gsub("^.+__","",names(domains))  #
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 # this shou be version 3
 | 
| 
 | 
     9 domains = readAAStringSet("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_protein-domains.fasta")
 | 
| 
 | 
    10                                         # this cannot be used - __ is also in element id!!!
 | 
| 
 | 
    11                                         # element_names = gsub("^.+__","",names(domains))  #
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 element_names = sapply(strsplit(names(domains),split="__"),function(x)paste(x[-1],collapse="__"))
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 classification = readLines("/mnt/raid/454_data/databases/protein_domains/Viridiplantae_v001/Viridiplantae_v001_ALL_classification")
 | 
| 
 | 
    16 ## classification contain slash in categories - must be replaced with underscore
 | 
| 
 | 
    17 classification = gsub("/","_",classification)
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 names(classification) = sapply (strsplit(classification, split="\t"),"[[", 1)
 | 
| 
 | 
    20 classification_formated = sapply (sapply(strsplit(classification, "\t"), "[",-1), paste, collapse="/")
 | 
| 
 | 
    21 domain_name = gsub("__.+","",names(domains))
 | 
| 
 | 
    22 table(domain_name)
 | 
| 
 | 
    23 full_names = paste0(names(domains),"#", classification_formated[element_names],':', domain_name)
 | 
| 
 | 
    24 head(full_names)
 | 
| 
 | 
    25 names(domains) = full_names
 | 
| 
 | 
    26 writeXStringSet(domains,"/mnt/raid/users/petr/workspace/repex_tarean/databases/protein_database_viridiplantae_v3.0.fasta")
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 library(data.tree)
 | 
| 
 | 
    30 library(treemap)
 | 
| 
 | 
    31 data(GNI2014)
 | 
| 
 | 
    32 class(GNI2014)
 | 
| 
 | 
    33 head(classification_formated)
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 ## compile all classification together
 | 
| 
 | 
    36 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database.fasta")
 | 
| 
 | 
    37 dna_dat = readDNAStringSet("/mnt/raid/users/petr/workspace/repex_tarean/databases/dna_database_masked.fasta")
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 add_weight = function(i, name){
 | 
| 
 | 
    40     if (is.null(i$parent[[name]])){
 | 
| 
 | 
    41         i$parent[[name]] = i[[name]]
 | 
| 
 | 
    42     }else{
 | 
| 
 | 
    43         i$parent[[name]] = i[[name]] + i$parent[[name]]
 | 
| 
 | 
    44     }
 | 
| 
 | 
    45     if (i$parent$level == 1){
 | 
| 
 | 
    46         return()
 | 
| 
 | 
    47     }else{
 | 
| 
 | 
    48         add_weight(i$parent, name)
 | 
| 
 | 
    49     }
 | 
| 
 | 
    50 }
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 cls_string = c(
 | 
| 
 | 
    54     "All/contamination",
 | 
| 
 | 
    55     "All/organelle/plastid",
 | 
| 
 | 
    56     "All/organelle/mitochondria",
 | 
| 
 | 
    57     "All/repeat/rDNA/45S_rDNA/18S_rDNA",
 | 
| 
 | 
    58     "All/repeat/rDNA/45S_rDNA/25S_rDNA",
 | 
| 
 | 
    59     "All/repeat/rDNA/45S_rDNA/5.8S_rDNA",
 | 
| 
 | 
    60     "All/repeat/rDNA/5S_rDNA",
 | 
| 
 | 
    61     "All/repeat/satellite",
 | 
| 
 | 
    62     "All/repeat/mobile_element/Class_I/SINE",
 | 
| 
 | 
    63     "All/repeat/mobile_element/Class_II/Subclass_1/TIR/MITE"
 | 
| 
 | 
    64 )
 | 
| 
 | 
    65 cls_full_name =c(
 | 
| 
 | 
    66     "contamination",
 | 
| 
 | 
    67     "organelle/plastid",
 | 
| 
 | 
    68     "organelle/mitochondria",
 | 
| 
 | 
    69     "45S_rDNA/18S_rDNA",
 | 
| 
 | 
    70     "45S_rDNA/25S_rDNA",
 | 
| 
 | 
    71     "45S_rDNA/5.8S_rDNA",
 | 
| 
 | 
    72     "5S_rDNA/5S_rDNA",
 | 
| 
 | 
    73     "satellite",
 | 
| 
 | 
    74     "Class_I/SINE",
 | 
| 
 | 
    75     "Class_II/Subclass_1/TIR/MITE"
 | 
| 
 | 
    76 )
 | 
| 
 | 
    77 
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 
 | 
| 
 | 
    80 df1 = data.frame(pathString = cls_string, full_name=cls_full_name, stringsAsFactors = FALSE, nhits = 0,domains=0, prop=0, mean_weight=0, total_weight=0)
 | 
| 
 | 
    81 df2 = data.frame(pathString = paste("All/repeat/mobile_element",unique(classification_formated),sep="/"), full_name= unique(classification_formated), stringsAsFactors = FALSE,  nhits = 0, domains = 0, prop =0, mean_weight=0, total_weight = 0)
 | 
| 
 | 
    82 cls_tree = as.Node (rbind(df1,df2))
 | 
| 
 | 
    83 saveRDS(object = cls_tree, file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree_viridiplantae_v3.0.rds")
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 print(cls_tree, "nhits", 'domains')
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 names(cls_tree)
 | 
| 
 | 
    88 cls_tree$leaves[[3]]$mean_weight
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 cls_tree$leaves[[1]]$mean_weight
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 
 | 
| 
 | 
    94 ## add to nodes
 | 
| 
 | 
    95 add_value_to_nodes = function(tr, name="weight"){
 | 
| 
 | 
    96   for (i in Traverse(tr)){
 | 
| 
 | 
    97       w = sum(sapply(i$leaves,"[[",name))
 | 
| 
 | 
    98       i[[name]] = w
 | 
| 
 | 
    99   }
 | 
| 
 | 
   100   return(tr)
 | 
| 
 | 
   101 }
 | 
| 
 | 
   102 
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 tr2 = add_value_to_nodes(cls_tree, name="nhits")
 | 
| 
 | 
   105 print(tr2,"nhits")
 | 
| 
 | 
   106 
 | 
| 
 | 
   107 ## add_nhits
 | 
| 
 | 
   108 for (i in cls_tree$leaves){
 | 
| 
 | 
   109     cls_string = i$full_name
 | 
| 
 | 
   110     ## go to the root:
 | 
| 
 | 
   111     nhits = i$nhits
 | 
| 
 | 
   112     prop = i$prop
 | 
| 
 | 
   113     mean_weight = i$mean_weight
 | 
| 
 | 
   114     add_weight(i, "nhits")
 | 
| 
 | 
   115 }
 | 
| 
 | 
   116 
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 
 |