comparison R/ltr_utils.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents 6ae4a341d1f3
children 9de392f2fc02
comparison
equal deleted inserted replaced
6:b91ca438a1cb 7:c33d6583e548
9 gff_updated$downstream_domain <- unlist(downstreams) 9 gff_updated$downstream_domain <- unlist(downstreams)
10 names(gff_updated) <- NULL 10 names(gff_updated) <- NULL
11 return(gff_updated) 11 return(gff_updated)
12 } 12 }
13 13
14 get_domain_clusters_alt <- function(gff, dist_models, threshold=0.99){
15 gff <- sort(gff, by = ~ seqnames * start)
16 domain_pairs <- data.frame(
17 D1 = paste0(head(gff$Name,-1),"_S"),
18 D2 = paste0(gff$Name[-1],"_S"),
19 C1 = head(gff$Final_Classification,-1),
20 C2 = gff$Final_Classification[-1],
21 S1 = head(strand(gff),-1),
22 S2 = strand(gff)[-1],
23 start1 = head(start(gff),-1),
24 start2 = start(gff)[-1],
25 chrpos= paste0(seqnames(gff)[-1],"_",start(gff)[-1])
26 )
27 domain_pairs$D_A <- with(domain_pairs, ifelse(D1 < D2, D1, D2))
28 domain_pairs$D_B <- with(domain_pairs, ifelse(D1 > D2, D1, D2))
29 domain_pairs$quantile <- 1
30 same_element_cluster <- domain_pairs$S1 == domain_pairs$S2 & domain_pairs$C1 == domain_pairs$C2
31 domain_pairs$same_element_cluster <- same_element_cluster
32
33 q_plus_function <- apply(domain_pairs[same_element_cluster,],
34 1,
35 function(x) dist_models$plus[[x["C1"]]][[x["D_A"]]][[x["D_B"]]])
36
37 D <- abs(domain_pairs[same_element_cluster,]$start1 - domain_pairs[same_element_cluster,]$start2)
38
39 q_plus_value <- mapply(function(x, D)if(is.null(x)){0}else{x(D)}, x= q_plus_function, D = D)
40
41 domain_pairs$quantile[same_element_cluster] <- q_plus_value
42
43 domain_pairs$split_position <- !same_element_cluster | domain_pairs$quantile > threshold
44 # combine clusters based on distances and original cluster
45 clusters <- paste(cumsum(c(TRUE, domain_pairs$split_position)), get_domain_clusters(gff))
46 return(clusters)
47 }
48
49
14 get_domain_clusters <- function(gff) { 50 get_domain_clusters <- function(gff) {
15 # get consecutive domains from same linage 51 # get consecutive domains from same linage
16 # must be sorted! 52 # must be sorted!
17 gag_plus <- as.numeric(cumsum(gff$Name == "GAG" & strand(gff) == '+')) 53 plus_order_split <- c(0, as.numeric(cumsum(head(gff$domain_order, -1) >= gff$domain_order[-1] & strand(gff)[-1] == '+')))
18 gag_minus <- rev(as.numeric(cumsum(rev(gff$Name == "GAG" & strand(gff) == '-')))) 54 minus_order_split <- rev(as.numeric(cumsum(rev(c(gff$domain_order[-1] >= head(gff$domain_order,-1) & strand(gff)[-1] == '-', FALSE)))))
19 # split based on classification - must be same and consecutive 55 # split based on classification - must be same and consecutive
20 x <- rle(gff$Final_Classification) 56 x <- rle(gff$Final_Classification)
21 # split on strand change 57 # split on strand change
22 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff))) 58 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff)))
23 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff), 59 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff),
24 "_", gag_plus, "_", gag_minus, "_", s) 60 "_", plus_order_split, "_", minus_order_split, "_", s)
25 gff_clusters <- split(as.data.frame(gff), factor(domain_cluster, levels = unique(domain_cluster))) 61 clusters <- factor(domain_cluster, levels = unique(domain_cluster))
26 gff_clusters 62 return(clusters)
27 } 63 }
28 64
29 clean_domain_clusters <- function(gcl) { 65 # create partial TE from clusters
66 get_partial_te_from_cluster_of_domains <- function (gpart){
67 ID <- paste("TE_partial_", gpart$Cluster[1], sep="")
68 te_partial <- GRanges(type="transposable_element_partial",
69 strand=strand(gpart)[1],
70 ID = ID,
71 source = "dante_ltr",
72 seqnames=seqnames(gpart)[1],
73 Final_Classification=gpart$Final_Classification[1],
74 Name=gpart$Final_Classification[1],
75 IRanges(min(start(gpart)), max(end(gpart))))
76 gpart$Parent <- ID
77 return(c(te_partial,gpart))
78 }
79
80 # create partial TE from clusters
81 get_partial_te_from_cluster_of_domains <- function(gpart, ID) {
82 te_partial <- makeGRangesFromDataFrame(
83 data.frame(type = "transposable_element_partial",
84 strand = gpart$strand[1],
85 ID = ID,
86 source = "dante_ltr",
87 seqnames = gpart$seqnames[1],
88 Final_Classification =
89 gpart$Final_Classification[1],
90 Name = gpart$Final_Classification[1],
91 IRanges(min(gpart$start), max(gpart$end))),
92 keep.extra.columns = TRUE)
93
94
95 gpart$Parent <- ID
96 gpart_gr = makeGRangesFromDataFrame(gpart, keep.extra.columns = TRUE)
97 return(c(te_partial, gpart_gr))
98 }
99
100 # function to count for each element number of occurences:
101 count_occurences_for_each_element <- function(x) {
102 counts_unique <- table(x)
103 counts <- counts_unique[x]
104 counts
105 }
106
107
108 clean_domain_clusters <- function(gcl, lineage_domain_span, min_domains) {
30 ## remove clusters wich does not have enough domains or domains 109 ## remove clusters wich does not have enough domains or domains
31 ## are on different strand 110 ## are on different strand
32 N_domains <- sapply(gcl, nrow) 111 N_domains <- sapply(gcl, nrow)
33 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name))) 112 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name)))
34 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " ")) 113 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " "))
35 S_OK <- S %in% c("+", "-") 114 S_OK <- S %in% c("+", "-")
36 min_domains <- 5 115 max_span <- lineage_domain_span[sapply(gcl, function(x) x$Final_Classification[1])]
37 maxlength <- 15000 # max span between domains 116 # set to zero if lineage is not covered in lineage domain span
117 max_span[is.na(max_span)] <- 0
38 span <- sapply(gcl, function(x)max(x$end) - min(x$start)) 118 span <- sapply(gcl, function(x)max(x$end) - min(x$start))
39 cond1 <- S_OK & 119 cond1 <- S_OK &
40 N_unique_domains == N_domains & 120 N_unique_domains == N_domains &
41 N_domains >= min_domains & 121 N_domains >= min_domains &
42 span <= maxlength 122 span <= max_span
43 return(gcl[cond1]) 123 return(gcl[cond1])
44 } 124 }
45 125
46 check_ranges <- function(gx, s, offset = OFFSET) { 126 check_ranges <- function(gx, s, offset = OFFSET) {
47 # check is range is not out of sequence length 127 # check is range is not out of sequence length
90 }else { 170 }else {
91 return(max(start(x))) 171 return(max(start(x)))
92 } 172 }
93 } 173 }
94 174
175 domain_distance <- function(d_query, d_reference){
176 if (d_query == d_reference){
177 return (0)
178 }
179 d_query_p <- strsplit(d_query," ")[[1]]
180 d_reference_p <- strsplit(d_reference," ")[[1]]
181 d <- length(d_reference_p) - sum(d_query_p == d_reference_p[d_reference_p %in% d_query_p])
182 return(d)
183 }
95 184
96 185
97 trim2TGAC <- function(bl) { 186 trim2TGAC <- function(bl) {
98 for (i in 1:nrow(bl)) { 187 for (i in 1:nrow(bl)) {
99 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") 188 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?")
127 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){ 216 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){
128 if (ncpus == 1){ 217 if (ncpus == 1){
129 query <- list(seqs) 218 query <- list(seqs)
130 }else{ 219 }else{
131 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs)))) 220 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs))))
221 if (length(query) < ncpus){
222 ncpus <- length(query)
223 }
132 } 224 }
133 if(is.null(db)){ 225 if(is.null(db)){
134 # search against itself 226 # search against itself
135 db <- seqs 227 db <- seqs
136 } 228 }
181 no_hit = id_with_no_hits) 273 no_hit = id_with_no_hits)
182 ) 274 )
183 } 275 }
184 276
185 277
186 analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ 278 analyze_TE <- function(seqs, ncpus = 10, word_size = 20, perc_identity = 90, ...){
187 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = 90) 279 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = perc_identity, ...)
188 te_conflict_info <- identify_conflicts(blt) 280 te_conflict_info <- identify_conflicts(blt)
189 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) 281 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok)
190 te_ok_lineages <- split(blt_te_ok, 282 te_ok_lineages <- split(blt_te_ok,
191 gsub( 283 gsub(
192 ".+[|]", 284 ".+[|]",
196 FUN = get_representative_ranges, 288 FUN = get_representative_ranges,
197 mc.cores = ncpus 289 mc.cores = ncpus
198 )) 290 ))
199 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative)) 291 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative))
200 names(seqs_representative) <- seqnames(Reduce(c, gr_representative)) 292 names(seqs_representative) <- seqnames(Reduce(c, gr_representative))
201 # TODO - add swithin group verification here, ! exclude self hits!!
202 293
203 return( 294 return(
204 list( 295 list(
205 seqs_representative = seqs_representative, 296 seqs_representative = seqs_representative,
206 te_conflict_info = te_conflict_info, 297 te_conflict_info = te_conflict_info,
238 return(list(multiplicity = mp, 329 return(list(multiplicity = mp,
239 id_ok_mp_verified = id_ok_mp_verified)) 330 id_ok_mp_verified = id_ok_mp_verified))
240 331
241 } 332 }
242 333
243 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10){ 334 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10, ...){
244 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size) 335 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size, ...)
245 QL <- gsub(".+[|]", "", blt$qaccver) 336 QL <- gsub(".+[|]", "", blt$qaccver)
246 SL <- gsub(".+[|]", "", blt$saccver) 337 SL <- gsub(".+[|]", "", blt$saccver)
247 id_with_conflict <- unique(c(blt$qaccver[QL != SL])) 338 id_with_conflict <- unique(c(blt$qaccver[QL != SL]))
248 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict) 339 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict)
249 # check coverage hits 340 # check coverage hits
338 list(richness=R, shannon_index=SI) 429 list(richness=R, shannon_index=SI)
339 } 430 }
340 431
341 432
342 433
343 blast <- function(s1, s2) { 434 blast <- function(s1, s2, expected_aln_lenght=NULL) {
344 tmp1 <- tempfile() 435 tmp1 <- tempfile()
345 tmp2 <- tempfile() 436 tmp2 <- tempfile()
346 tmp_out <- tempfile() 437 tmp_out <- tempfile()
347 writeXStringSet(DNAStringSet(s1), tmp1) 438 writeXStringSet(DNAStringSet(s1), tmp1)
348 writeXStringSet(DNAStringSet(s2), tmp2) 439 writeXStringSet(DNAStringSet(s2), tmp2)
359 if (nrow(out_raw) == 0) { 450 if (nrow(out_raw) == 0) {
360 return(out_raw) 451 return(out_raw)
361 } 452 }
362 out <- trim2TGAC(out_raw) 453 out <- trim2TGAC(out_raw)
363 # remove alingment shorted that 454 # remove alingment shorted that
364 out <- out[out$length > 100,] 455 # out <- out[out$length > 100,]
456 # alngment must be at least 80% of expected LTRmin95
457 out <- out[out$length > (expected_aln_lenght *0.8),]
365 if (nrow(out) == 0) { 458 if (nrow(out) == 0) {
366 return(out) 459 return(out)
367 } 460 }
368 ## filter for TGCA 461 ## filter for TGCA
369 TG_L <- substring(out$qseq, 1, 2) == "TG" 462 TG_L <- substring(out$qseq, 1, 2) == "TG"
480 TSD$type <- "target_site_duplication" 573 TSD$type <- "target_site_duplication"
481 TSD$Parent <- ID 574 TSD$Parent <- ID
482 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence) 575 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence)
483 } 576 }
484 577
485 TE$Final_Classification <- D$Final_Classification[1] 578 TE$Name <- TE$Final_Classification <- D$Final_Classification[1]
486 579
487 D$Parent <- ID 580 D$Parent <- ID
488 out <- c(TE, LTR_3, LTR_5, D, TSD) 581 out <- c(TE, LTR_3, LTR_5, D, TSD)
489 return(out) 582 return(out)
490 } 583 }
491 584
492 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR) { 585 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR,LTR_length) {
493 xx <- blast(Lseq, Rseq) 586 xx <- blast(Lseq, Rseq, LTR_length)
494 if (nrow(xx) == 0) { 587 if (nrow(xx) == 0) {
495 return(NULL) 588 return(NULL)
496 }else { 589 }else {
497 ltr_tmp <- list() 590 ltr_tmp <- list()
498 for (j in 1:nrow(xx)) { 591 for (j in 1:nrow(xx)) {
589 Rank[ID %in% DLP_id] <- "DLP" 682 Rank[ID %in% DLP_id] <- "DLP"
590 Rank[ID %in% DLTP_id] <- "DLTP" 683 Rank[ID %in% DLTP_id] <- "DLTP"
591 return(Rank) 684 return(Rank)
592 } 685 }
593 686
594 get_te_statistics <- function(gr, RT) { 687
595 DOMAINS_LTR <- gr[gr$type == "transposable_element" & 688 get_te_statistics <- function(gr, RT){
596 gr$TSD == "not_found" & 689 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
597 is.na(gr$trna_id)]
598 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" &
599 gr$TSD != "not_found" &
600 is.na(gr$trna_id)]
601 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" &
602 gr$TSD != "not_found" &
603 !is.na(gr$trna_id)]
604 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" &
605 gr$TSD == "not_found" &
606 !is.na(gr$trna_id)]
607
608 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE)) 690 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
609 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class))) 691 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
692 rank_table <- list()
693 for (i in 1:length(Ranks)) {
694 gr_part <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
695 rank_table[[Ranks[i]]] <- as.integer(table(factor(gr_part$Final_Classification, levels = all_class)))
696
697 }
698 out <- cbind(do.call(cbind, rank_table), RT_domain=RT_domain)
699 out <- rbind(out, Total = colSums(out))
700 rownames(out) <- c(all_class, "Total")
701 return(out)
702 }
703
704
705 get_te_statistics_old <- function(gr, RT) {
706 DOMAINS <- gr[gr$type == "transposable_element" & gr$Rank == "D"]
707 DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$Rank == "DL"]
708 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & gr$Rank == "DLT"]
709 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLTP"]
710 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLP"]
711
712 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
713 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
714 D <- as.integer(table(factor(DOMAINS$Final_Classification, levels = all_class)))
610 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class))) 715 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class)))
611 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class))) 716 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class)))
612 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class))) 717 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class)))
613 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class))) 718 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class)))
614 out <- data.frame(RT_domain = RT_domain, 719 out <- data.frame(RT_domain = RT_domain,
720 DOMAINS = D,
615 DOMAINS_LTR = DL, 721 DOMAINS_LTR = DL,
616 DOMAINS_LTR_TSD = DLT, 722 DOMAINS_LTR_TSD = DLT,
617 DOMAINS_LTR_PBS = DLP, 723 DOMAINS_LTR_PBS = DLP,
618 DOMAINS_LTR_TSD_PBS = DLTP, 724 DOMAINS_LTR_TSD_PBS = DLTP,
619 row.names = all_class 725 row.names = all_class
642 B <- gr_te$Final_Classification 748 B <- gr_te$Final_Classification
643 names(ID) <- paste0(A, "#", B) 749 names(ID) <- paste0(A, "#", B)
644 750
645 } 751 }
646 752
647 get_te_sequences <- function(gr, s) { 753
648 # return list of biostrings 754 get_te_sequences <- function (gr, s) {
649 DOMAINS_LTR <- gr[gr$type == "transposable_element" & 755 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
650 gr$TSD == "not_found" & 756 s_te <- list()
651 is.na(gr$trna_id)] 757 for (i in 1:length(Ranks)) {
652 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & 758 gr_te <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
653 gr$TSD != "not_found" & 759 if (length(gr_te) > 0) {
654 is.na(gr$trna_id)] 760 s_te[[Ranks[i]]] <- getSeqNamed(s, gr_te)
655 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & 761 }
656 gr$TSD != "not_found" & 762 }
657 !is.na(gr$trna_id)] 763 return(s_te)
658 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & 764 }
659 gr$TSD == "not_found" & 765
660 !is.na(gr$trna_id)]
661 s_DL <- getSeqNamed(s, DOMAINS_LTR)
662 s_DLT <- getSeqNamed(s, DOMAINS_LTR_TSD)
663 s_DLP <- getSeqNamed(s, DOMAINS_LTR_PBS)
664 s_DLTP <- getSeqNamed(s, DOMAINS_LTR_TSD_PBS)
665 return(DNAStringSetList(
666 DL = s_DL,
667 DLT = s_DLT,
668 DLP = s_DLP,
669 DLTP = s_DLTP
670 ))
671
672 }
673 766
674 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){ 767 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){
675 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative 768 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative
676 # input sequences must be in the same orientation!!! 769 # input sequences must be in the same orientation!!!
677 sfile <- tempfile() 770 sfile <- tempfile()
708 is_representative = is_representative[ord] 801 is_representative = is_representative[ord]
709 ) 802 )
710 return(cls_info) 803 return(cls_info)
711 } 804 }
712 805
806
807 dante2dist <- function(dc){
808 # dc - cluster of domains, granges
809 dpos <- get_dom_pos(dc)
810 D <- c(dist(dpos))
811 NN <- combn(names(dpos), 2)
812 # order feature in pair alphabeticaly
813 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
814 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
815 dfd <- data.frame(F1 = N1, F2 = N2, distance = D)
816 }
817
818
819
820 dante_to_quantiles <- function(dc, model_function, lineage=NULL){
821 dfd <- dante2dist(dc)
822 if (is.null(lineage)){
823 lineage <- dc$Final_Classification[1]
824 }
825 # check if lineage has defined ecdf
826 if (lineage %in% names(model_function$plus)){
827 fn <- model_function$plus[[lineage]]
828 fnm <- model_function$minus[[lineage]]
829 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
830 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
831 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
832 return(dout)
833 }else{
834 return(NULL)
835 }
836 }
837
838 get_dom_pos <- function(g){
839 if (length(g) == 0){
840 return(NULL)
841 }
842 gdf <- data.frame(rexdb = as.character(seqnames(g)),
843 domain = g$Name,
844 S = start(g),
845 E = end(g),
846 stringsAsFactors = FALSE)
847 gSmat <- xtabs(S ~ rexdb + domain, data = gdf)
848 colnames(gSmat) <- paste0(colnames(gSmat),"_S")
849 gEmat <- xtabs(E ~ rexdb + domain, data = gdf)
850 colnames(gEmat) <- paste0(colnames(gEmat),"_E")
851 SEmat <- cbind(gSmat,gEmat)
852 # reorder
853 dom_position <- colMeans(SEmat)
854 SEmat_sorted <- SEmat[,order(dom_position)]
855 return(SEmat_sorted)
856 }