Mercurial > repos > metaboflow_cam > ionflow
view ionflow/ionflow_funcs.R @ 0:3b461dc9568b draft default tip
Uploaded
author | metaboflow_cam |
---|---|
date | Mon, 09 Aug 2021 09:41:22 +0000 |
parents | |
children |
line wrap: on
line source
#' ======================================================================= #' PreProcessing <- function(data = NULL, var_id = 1, batch_id = 3, data_id = 5, method_norm = c("median", "median+std", "none"), control_lines = NULL, control_use = c("control", "all", "control.out"), method_outliers = c("mad", "IQR", "log.FC.dist", "none"), thres_outl = 3, stand_method = c("std", "mad", "custom"), stdev = NULL, thres_symb = 2) { #' -------------------> Import data #' ji: get sample id data$sample_id <- rownames(data) data <- data[, c(var_id, ncol(data), batch_id, data_id:(ncol(data) - 1))] names(data)[1:3] <- c("Line", "Sample_ID", "Batch_ID") mat <- data[, -c(1:3)] #' ji: Remove samples with zeros or negative or missing values data <- data[!is.na(sum(mat)), ] data <- data[!(sum(mat <= 0) > 0), ] #' get summary stats res <- as.data.frame(t(sapply(mat, function(x) { c(round(summary(x), 3), round(var(x), 3)) }))) names(res)[ncol(res)] <- "Variance" res <- cbind(Ion = names(mat), res) rownames(res) <- NULL df_raw <- res #' data long format data_long <- reshape2::melt(data, id = c("Line", "Sample_ID", "Batch_ID"), variable.name = "Ion", value.name = "Concentration" ) #' convert to factors before using levels function. data_long$Line <- factor(data_long$Line) data_long$Ion <- factor(data_long$Ion) ion_name <- levels(data_long$Ion) #' -------------------> Median batch correction data_long$control <- rep(1, length(data_long$Concentration)) data_long$log <- log(data_long$Concentration) #' ji: control use in batch correction if (length(control_lines) > 0) { if (control_use == "all") { } if (control_use == "control") { data_long$control[!(data_long$Line %in% control_lines)] <- 0 } if (control_use == "control.out") { data_long$control[data_long$Line %in% control_lines] <- 0 } } #' batch correction methods if (method_norm == "median") { data_long <- plyr::ddply(data_long, "Ion", function(x) { res <- plyr::ddply(x, "Batch_ID", function(y) { med <- median(y$log[y$control == 1]) y$log_corr <- y$log - med y$med <- med y }) }) } if (method_norm == "median+std") { data_long <- plyr::ddply(data_long, "Ion", function(x) { res <- plyr::ddply(x, "Batch_ID", function(y) { med <- median(y$log[y$control == 1]) st_de <- sd(y$log[y$control == 1]) y$log_corr <- y$log - med - st_de y$med <- med y }) }) } if (method_norm == "none") { data_long$log_corr <- data_long$log data_long$med <- NULL } #' get correction stats res <- plyr::ddply(data_long, "Ion", function(x) { c(round(summary(x$log_corr), 3), round(var(x$log_corr), 3)) }) names(res)[ncol(res)] <- "Variance" df_bat <- res #' -------------------> Outlier detection #' ji: outlier detection methods if (method_outliers == "IQR") { data_long <- plyr::ddply(data_long, "Ion", function(x) { res <- plyr::ddply(x, "Batch_ID", function(y) { lowerq <- quantile(y$log_corr, na.rm = T)[2] upperq <- quantile(y$log_corr, na.rm = T)[4] iqr <- upperq - lowerq extreme.t.upper <- (iqr * thres_outl) + upperq extreme.t.lower <- lowerq - (iqr * thres_outl) y$Outlier <- ifelse((y$log_corr > extreme.t.upper) | (y$log_corr < extreme.t.lower), 1, 0) return(y) }) }) } if (method_outliers == "mad") { data_long <- plyr::ddply(data_long, "Ion", function(x) { res <- plyr::ddply(x, "Batch_ID", function(y) { med_dev <- mad(y$log_corr) extreme.t.upper <- (med_dev * thres_outl) extreme.t.lower <- -(med_dev * thres_outl) y$Outlier <- ifelse((y$log_corr > extreme.t.upper) | (y$log_corr < extreme.t.lower), 1, 0) return(y) }) }) } if (method_outliers == "log.FC.dist") { data_long <- plyr::ddply(data_long, "Ion", function(x) { res <- plyr::ddply(x, "Batch_ID", function(y) { extreme.t.upper <- thres_outl extreme.t.lower <- -thres_outl y$Outlier <- ifelse((y$log_corr > extreme.t.upper) | (y$log_corr < extreme.t.lower), 1, 0) return(y) }) }) } if (method_outliers == "none") { data_long$Outlier <- rep(0, length(data_long$Line)) df_outlier <- data.frame() } else { df_outlier <- data.frame(cbind( levels(data_long$Ion), table(data_long$Ion, data_long$Outlier), round(table(data_long$Ion, data_long$Outlier)[, 2] / dim(data_long)[1] * 100, 2) )) rownames(df_outlier) <- c() colnames(df_outlier) <- c("Ion", "no_outlier", "outlier", "outlier(%)") } #' ji: remove samples with at least an outlier value samples_to_exclude <- unique(data_long$Sample_ID[data_long$Outlier == 1]) data_long <- data_long[!(data_long$Sample_ID %in% samples_to_exclude), ] #' -------------------> Standardisation #' ji: standardisation methods if (stand_method == "std") { sds <- plyr::ddply(data_long, "Ion", function(x) sd(x$log_corr)) nam <- sds[, 1] sds <- as.numeric(as.vector(sds[, 2])) names(sds) <- nam } if (stand_method == "mad") { sds <- plyr::ddply(data_long, "Ion", function(x) mad(x$log_corr)) nam <- sds[, 1] sds <- as.numeric(as.vector(sds[, 2])) names(sds) <- nam } if (stand_method == "custom") { # specific 2-columns format for vector of sd sds <- stdev nam <- sds[, 1] sds <- as.numeric(as.vector(sds[, 2])) names(sds) <- nam } #' ji: aggregate measurements at line level (median) data_wide_line_log_norm <- reshape2::dcast(data_long, Line ~ Ion, fun.aggregate = median, value.var = "log_corr" ) #' ji: normalise by stds data_wide_line_z_score <- data_wide_line_log_norm data_wide_line_z_score[, 2:ncol(data_wide_line_z_score)] <- data_wide_line_z_score[, 2:ncol(data_wide_line_z_score)] / sds #' -------------------> Symbolisation symb_profiles <- data_wide_line_z_score[, 2:ncol(data_wide_line_z_score)] symb_profiles[(symb_profiles > -thres_symb) & (symb_profiles < thres_symb)] <- 0 symb_profiles[symb_profiles >= thres_symb] <- 1 symb_profiles[symb_profiles <= -thres_symb] <- -1 #' ji: save sybolic profiles data_wide_line_symb <- cbind(Line = data_wide_line_z_score$Line, symb_profiles) #' plot z-score distributions p1 <- ggplot( data = data_long, aes(x = factor(Batch_ID), y = log_corr, col = factor(Batch_ID)) ) + geom_point(shape = 1) + facet_wrap(~Ion) + xlab("Batch.ID") + ylab("Log FC to batch median") + theme(legend.position = "none") + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) # plot overview processed samples dat <- reshape2::melt(data_wide_line_z_score, id = "Line") p2 <- ggplot(data = dat, aes(x = value)) + geom_histogram(binwidth = .1) + facet_wrap(~variable) + xlab("Concentration (z-score)") + ylab("Counts") + xlim(-10, 10) + geom_vline(xintercept = c(-thres_symb, thres_symb), col = "red") # Histogram Number of Elements Changed ions.changed <- rowSums(abs(data_wide_line_symb[,2:ncol(data_wide_line_symb)])) p3 <- ggplot(data=data.frame(table(ions.changed)), aes(x=ions.changed, y=Freq)) + geom_bar(stat="identity") + xlab("Number of Elements Changed") + ylab("Number of Mutants"); # Histogram number of changes per element ions.up <- colSums((data_wide_line_symb[,2:ncol(data_wide_line_symb)])==1) ions.down <- colSums((data_wide_line_symb[,2:ncol(data_wide_line_symb)])==-1) ions.direction <- data.frame(hist.name = names(c(ions.down, ions.up)), hist.val = c(ions.down, ions.up), hist.type = c(rep("down",length(ions.down)), rep("up",length(ions.up))) ) p4 <- ggplot(data=ions.direction, aes(fill=hist.type, x=hist.name, y=hist.val)) + geom_bar(position="stack", stat="identity") + scale_fill_manual(values=c("down"="blue","up"="red")) + theme(legend.title = element_blank()) + ylab("Number of Mutants") + xlab(""); if (method_norm != "none"){ # plot batch median log-concentrations plot.data.long <- data_long[data_long["Outlier"]==0, ] batch_median_ions <- plot.data.long[!duplicated(plot.data.long[,"med"]), c("Batch_ID","Ion","med"), drop = FALSE]; ion.batch.mean.median <- batch_median_ions %>% dplyr::group_by(Ion) %>% dplyr::summarize(mean = mean(med, na.rm = TRUE)) ion.order <- as.character(ion.batch.mean.median$Ion[sort.list(ion.batch.mean.median$mean)], decreasing = TRUE) batch_median_ions$Ion <- factor(batch_median_ions$Ion, levels = ion.order) p5 <- ggplot(batch_median_ions, aes(x=Ion, y=med, group = Batch_ID)) + geom_line(aes(), color = "gray") + theme(text = element_text(size=20), axis.title.x=element_blank()) + ylab("Log-concentration") + stat_summary(aes(y = med, group=1), fun = mean, colour="green1", geom="line", group=1) + stat_summary(fun.min = function(x) mean(x) - sd(x), fun.max = function(x) mean(x) + sd(x), position ='dodge', colour="green1", geom = 'errorbar', aes(group = 1), width=.1)+ ggtitle("Median log-concentrations of batches"); # plot CV batch median log-concentrations ion.batch.cv.median <- batch_median_ions %>% dplyr::group_by(Ion) %>% dplyr::summarize(CV = sd(med, na.rm = TRUE)/mean(med, na.rm = TRUE)) ion.batch.cv.median <- data.frame(ion.batch.cv.median) ion.batch.cv.median <- ion.batch.cv.median[order(ion.batch.cv.median$CV),] ion.batch.cv.median$Ion <- factor(ion.batch.cv.median$Ion, levels = ion.batch.cv.median$Ion) p6 <- ggplot(ion.batch.cv.median, aes(x=Ion, y=abs(CV), group =1)) + ylab(" Absolute CV") + geom_line(aes(), color = "red") + theme(text = element_text(size=20), axis.title.x=element_blank()) + scale_y_log10() + ggtitle("CV median log-concentrations across batches"); }else{ p5 <- NULL p6 <- NULL } #' -------------------> Output res <- list() res$stats.raw.data <- df_raw # raw data res$stats.outliers <- df_outlier # outliers res$stats.batches <- df_bat # batch corrected data res$stats.std <- sds # standard deviations res$data.long <- data_long # with Batch_ID res$data.line.logFC <- data_wide_line_log_norm res$data.line.zscores <- data_wide_line_z_score res$data.line.symb <- data_wide_line_symb res$plot.overview <- p1 res$plot.hist <- p2 res$plot.change.stat <- p3 res$plot.change.dir <- p4 res$plot.medians <- p5 res$plot.CV <- p6 return(res) } #' ======================================================================= #' IonAnalysis <- function(data = NULL, thres_ion_corr = 0.15, method_ion_corr = "pearson"){ #' -------------------> PCA ionProfile.PCA = NULL ionProfile.PCA$pr.y = prcomp(data[, -1],scale. = F) ionProfile.PCA$y = cbind(data.frame(Line = data$Line), as.data.frame(ionProfile.PCA$pr.y$x)) unit.norm = function(x)(x / sqrt(sum(x^2))) ionProfile.PCA$y[,grepl("^PC",colnames(ionProfile.PCA$y))] = apply(ionProfile.PCA$y[,grepl("^PC",colnames(ionProfile.PCA$y))],2,unit.norm) p.PCA.data = ionProfile.PCA$y %>% dplyr::select(Line, PC1, PC2 ) my.annotation = tbl_df(ionProfile.PCA$pr.y$rotation) %>% dplyr::select(PC1, PC2) %>% dplyr::mutate(Ion = row.names(ionProfile.PCA$pr.y$rotation)) %>% `colnames<-`(c("x", "y", "Ion")) p_pca <- p.PCA.data %>% ggplot(aes(x = PC1, y = PC2)) + geom_segment(inherit.aes = F, data = my.annotation, aes(x = 0, y = 0, xend = x/2, yend = y/2), color = "blue", arrow = arrow(length = unit(0.02, "npc")))+ geom_text(inherit.aes = F, data = my.annotation, aes(x = 0.51*x, y = 0.51*y, label = Ion), color = "red", size = 4) + geom_point(size = 1, alpha = 1/20) + theme(aspect.ratio = 1) + scale_shape(solid = FALSE) + labs(x = paste0("PC1", " (",round(summary(ionProfile.PCA$pr.y)$importance[2,"PC1"],2)*100,"%)"), y = paste0("PC2"," (", round(summary(ionProfile.PCA$pr.y)$importance[2,"PC2"],2)*100,"%)"), color = "changed") #' PCA loading pca_loadings <- data.frame(ionProfile.PCA$pr.y$rotation) rownames(pca_loadings) <- colnames(data[, -1]) pca_loadings <- pca_loadings[, 1:2] #' -------------------> Correlation col3 <- colorRampPalette(c("steelblue4", "white", "firebrick")) corrplot.mixed(cor(data[, -1], use = "complete.obs", method = method_ion_corr), number.cex = .7, lower.col = "black", upper.col = col3(100) ) p_corr <- recordPlot() #' -------------------> Heatmap with dendrogram pheatmap(data[, -1], show_rownames = F, cluster_cols = T, cluster_rows = T, legend = T, fontsize = 15, clustering_method = "ward.D", scale = "row") p_heat <- recordPlot() #' -------------------> Correlation network #' wl-13-08-2020, Thu: there is no 'qgraph' in conda forge and bio conda. if (T) { #' library(glasso) #' corr_reg <- glasso(corr, rho = 0.01) #' net <- network::network(corr_reg$w, directed = FALSE) corr <- cor(na.omit(data[, -1]), method = method_ion_corr) corr[abs(corr) < thres_ion_corr] <- 0 net <- network::network(corr[], directed = FALSE) #' set edge attributes net %e% "weight" <- corr net %e% "weight_abs" <- abs(corr) * 6 net %e% "color" <- ifelse(net %e% "weight" > 0, "lightgreen", "coral") p_net <- ggnet2(net, label = TRUE, mode = "fruchtermanreingold", node.size = 10, edge.size = "weight_abs", edge.color = "color" ) } else { #' wl-06-07-2020, Mon: 'cor_auto' is from package qgraph(lavaan) #' wl-28-07-2020, Tue: cad and corr are the same cad <- cor_auto(data[, -1]) suppressWarnings(qgraph(cad, graph = "glasso", layout = "spring", tuning = 0.25, sampleSize = nrow(data[, -1]) )) Graph_lasso <- recordPlot() } res <- list() res$data.pca.load <- pca_loadings res$plot.pca <- p_pca res$plot.corr <- p_corr res$plot.net <- p_net res$plot.heat <- p_heat return(res) } #' ======================================================================= #' wl-04-10-2020, Sun: Hierarchical clustering #' ji-10-01-2020, Sun: Input, Method, Output simplified ProfileClustering <- function(symbol_profiles, min_clust_size = 10, h_tree = 0, filter_zero_string = TRUE) { if (filter_zero_string){ symbol_profiles <- symbol_profiles[rowSums(symbol_profiles[, -1])!=0, ] } x <- symbol_profiles[, -1] dis <- stats::dist(x, method = "manhattan") hc <- hclust(d = dis, method = "single") clus <- data.frame(Line = symbol_profiles$Line, Cluster = cutree(hc, h = h_tree)) tab <- as.data.frame(table(clus$Cluster), stringsAsFactors = F) names(tab) <- c("Cluster", "Number.of.genes") tab_subset <- tab[tab$Number.of.genes > min_clust_size, ] tab_subset <- tab_subset[order(tab_subset$Number.of.genes, decreasing = T), ] idx_subset <- clus %in% tab_subset$Cluster res <- list() res$clusters.vector <- clus res$tab.clusters <- tab res$tab.clusters.subset <- tab_subset return(res) } #' ======================================================================= #' wl-06-11-2020, Fri: Get ENTREZID from SYMBOL #' get_entrez_id <- function(gene_list, annot_pkg = "org.Sc.sgd.db", key_type = "ORF") { res <- AnnotationDbi::select(get(annot_pkg), keys = gene_list, columns = "ENTREZID", keytype = key_type) res <- res[,2,drop = T] res <- res[!is.na(res)] res <- res[!duplicated(res)] return(res) } #' ======================================================================= #' wl-03-10-2020, Sat: KEGG enrichment analysis for symbolization data #' wl-06-11-2020, Fri: The first column of data must be ORF for #' org.Sc.sgd.db or SYMBOL for any other annotation packages. #' ji-11-01-2020, Mon: Input, Method, Output simplified #' KeggEnricher <- function(cluster_vector, pval = 0.05, min_count = 3, annot_pkg = "org.Sc.sgd.db", gene_uni = NULL) { if(is.null(gene_uni)){ gene_uni <- as.character(cluster_vector$Line) } clusters_set <- unique(cluster_vector$Cluster) #' geneIds can be ORF or ENTREZID enrich <- lapply(clusters_set, function(x) { params <- new("KEGGHyperGParams", geneIds = gene_uni[cluster_vector$Cluster == x], universeGeneIds = gene_uni, annotation = annot_pkg, categoryName = "KEGG", pvalueCutoff = pval, testDirection = "over") over <- hyperGTest(params) }) # name list with original clusters ID names(enrich) <- clusters_set #' There is no explicit methods for getting manual summary table. summ <- lapply(enrich, function(x) { tmp <- summary(x) if (nrow(tmp) == 0) tmp <- NULL #' wl-04-10-2020, Sun: it happens very often. return(tmp) }) summ <- summ[!sapply(summ,is.null)] #' binding and filtering summ <- lapply(summ, "[", -c(3, 4)) %>% dplyr::bind_rows(.id = "Cluster") %>% dplyr::filter(Pvalue <= pval & Count >= min_count) res <- list() res$enrichment.summary <- summ res$enrichment.full.results <- enrich return(res) } #' ======================================================================= #' wl-03-10-2020, Sat: GO enrichment analysis for symbolization data #' wl-06-11-2020, Fri: The first column of data must be ORF for #' org.Sc.sgd.db or SYMBOL for any other annotation packages. #' GOEnricher <- function(cluster_vector, pval = 0.05, ont = c("BP","CC","MF"), min_count = 3, annot_pkg = "org.Sc.sgd.db", gene_uni = NULL) { if(is.null(gene_uni)){ gene_uni <- as.character(cluster_vector$Line) } ont <- match.arg(ont) genes_not_annotated <- NULL if ( annot_pkg == "org.Sc.sgd.db"){ genes_not_annotated <- unlist(mget( mappedkeys(org.Sc.sgdGO2ALLORFS)[which(is.na(Term(mappedkeys(org.Sc.sgdGO2ALLORFS))))], org.Sc.sgdGO2ALLORFS) ) gene_uni <- mappedkeys(org.Sc.sgdGO)[mappedkeys(org.Sc.sgdGO) %in% gene_uni] } if ( annot_pkg == "org.Hs.eg.db"){ genes_not_annotated <- unlist(mget( mappedkeys(org.Hs.egGO2ALLEGS)[which(is.na(Term(mappedkeys(org.Hs.egGO2ALLEGS))))], org.Hs.egGO2ALLEGS) ) #gene_uni <- mappedkeys(org.Hs.egGO) gene_uni <- mappedkeys(org.Hs.egGO)[mappedkeys(org.Hs.egGO) %in% gene_uni] } if ( annot_pkg == "org.Mm.eg.db"){ genes_not_annotated <- unlist(mget( mappedkeys(org.Mm.egGO2ALLEGS)[which(is.na(Term(mappedkeys(org.Mm.egGO2ALLEGS))))], org.Mm.egGO2ALLEGS) ) gene_uni <- mappedkeys(org.Mm.egGO)[mappedkeys(org.Mm.egGO) %in% cluster_vector$Line] } gene_uni <- as.character(gene_uni[!gene_uni %in% genes_not_annotated]) cluster_vector <- cluster_vector[cluster_vector$Line %in% gene_uni,] clusters_set <- unique(cluster_vector$Cluster) #' geneIds can be ORF or ENTREZID enrich <- lapply(clusters_set, function(x) { gene_set <- as.character(cluster_vector$Line[cluster_vector$Cluster == x]) params <- new("GOHyperGParams", geneIds = gene_set, universeGeneIds = gene_uni, annotation = annot_pkg, categoryName = "GO", ontology = ont, pvalueCutoff = pval, conditional = T, testDirection = "over") over <- hyperGTest(params) }) # name list with original clusters ID names(enrich) <- clusters_set summ <- lapply(enrich, function(x) { Pvalue <- round(pvalues(x), digit = 4) ID <- names(Pvalue) Description <- Term(ID) OddsRatio <- oddsRatios(x) ExpCount <- expectedCounts(x) Count <- geneCounts(x) CountUniverse <- universeCounts(x) tab <- cbind(ID, Description, Pvalue, OddsRatio, ExpCount, Count, CountUniverse) rownames(tab) <- NULL tab <- na.omit(tab) #' wl-03-10-2020, Sat: this is why summary fails. tab <- data.frame(tab, Ontology = ont) }) summ <- lapply(summ, "[", -c(4, 5)) %>% dplyr::bind_rows(.id = "Cluster") %>% dplyr::filter(Pvalue <= pval & Count >= min_count) res <- list() res$enrichment.summary <- summ res$enrichment.full.results <- enrich return(res) } GeneticNetwork <- function(data = NULL, method_corr = c("pearson", "spearman", "kendall", "cosine", "mahal_cosine", "hybrid_mahal_cosine"), network_modules = c("louvain", "input"), thres_corr = 0.7, cluster_vector = NULL, cluster_label_vector = NULL, n_labels = 3 ) { mat <- as.matrix(data[,!colnames(data)%in%c("Line","Cluster")]) #' Compute similarity matrix if (method_corr == "pearson" || method_corr == "spearman" || method_corr == "kendall") { corrGenes <- cor(t(as.matrix(mat)), method = method_corr, use = "pairwise.complete.obs") } else if (method_corr == "cosine") { corrGenes <- cosine(t(as.matrix(mat))) } else if (method_corr == "mahal_cosine") { corrGenes <- cosM(mat, mode = "normal") } else if (method_corr == "hybrid_mahal_cosine") { corrGenes <- cosM(mat, mode = "hybrid") } #' Adjacency matrix A <- corrGenes diag(A) <- 0 A <- (A > thres_corr) A <- ifelse(A == TRUE, 1, 0) if (network_modules == "input"){ if (length(cluster_label_vector) == 0){ cluster_vector <- as.character(cluster_vector$Cluster) }else{ mm <- match(cluster_vector$Cluster, cluster_label_vector$Cluster) cluster_vector <- cluster_label_vector$label[mm] } tmp <- unique(cluster_vector) if (length(tmp) != 1) { cpy <- rainbow(length(tmp)) names(cpy) <- tmp } else { cpy <- "Set2" } } if (network_modules == "louvain"){ # community detection tmp <- igraph::graph_from_adjacency_matrix(A, mode="undirected") com <- igraph::cluster_louvain(tmp, weights = NULL) cluster_vector <- as.character(igraph::membership(com)) degree_vector <- igraph::degree(tmp) tmp <- unique(cluster_vector) if (length(tmp) != 1) { cpy <- rainbow(length(tmp)) names(cpy) <- tmp } else { cpy <- "Set2" } } #' Generate network net <- network::network(A, directed = FALSE) net %v% "Label" <- cluster_vector #' Remove communities of size 1 if (network_modules == "louvain"){ net <- delete.vertices(net, which(degree_vector == 0)); } net_p <- GGally::ggnet2(net, mode = "fruchtermanreingold", color = "Label", palette = cpy, edge.alpha = 0.5, size = 2, color.legend = "Modules", legend.size = 10, legend.position = "right" ) #' network edge list net <- as.matrix(net, matrix.type = "edgelist")[,1:2]; net[,] <- cbind(as.character(data$Line[net[,1]]), as.character(data$Line[net[,2]])) #' Impact and betweenness btw <- sna::betweenness(A) # or use 'net' instead of 'A' impact <- apply(mat, 1, norm, type = "2") # L2 norm df.res <- data.frame( Line = data$Line, impact = round(impact, 3), betweenness = round(btw, 3), log.betweenness = round(log(btw + 1), 3), pos = factor(ifelse((impact < quantile(impact, .75)) & (log(btw + 1) < quantile(log(btw + 1), .75)), 1, ifelse((impact < quantile(impact, .75)) & (log(btw + 1) > quantile(log(btw + 1), .75)), 2, ifelse((impact > quantile(impact, .75)) & (log(btw + 1) < quantile(log(btw + 1), .75)), 3, 4) ) )), pos.label = factor(ifelse((impact < quantile(impact, .75)) & (log(btw + 1) < quantile(log(btw + 1), .75)), "Low impact, low betweenness", ifelse((impact < quantile(impact, .75)) & (log(btw + 1) > quantile(log(btw + 1), .75)), "Low impact, high betweenness", ifelse((impact > quantile(impact, .75)) & (log(btw + 1) < quantile(log(btw + 1), .75)), "High impact, low betweenness", "High impact, high betweenness") ) )) ) rownames(df.res) <- data$Line[] q1 <- row.names(subset(df.res, (impact < quantile(impact, .75)) & (log.betweenness < quantile(log.betweenness, .75)))) q2 <- row.names(subset(df.res, (impact < quantile(impact, .75)) & (log.betweenness > quantile(log.betweenness, .75)))) q3 <- row.names(subset(df.res, (impact > quantile(impact, .75)) & (log.betweenness < quantile(log.betweenness, .75)))) q4 <- row.names(subset(df.res, (impact > quantile(impact, .75)) & (log.betweenness > quantile(log.betweenness, .75)))) #' labels N <- n_labels lst <- list(q1, q2, q3, q4) idx <- lapply(lst, function(x) { if (length(x) > N){ tmp <- df.res[x, c("betweenness","impact")]; union(rownames(tmp)[order(tmp$betweenness, decreasing = TRUE)[1:N]], rownames(tmp)[order(tmp$impact, decreasing = TRUE)[1:N]]) }else{x} }) idx <- unique(unlist(idx)) df.idx <- df.res[idx, ] im_be_p <- ggplot(data = df.res, aes(x = impact, y = log.betweenness)) + geom_point(aes(col = pos.label), alpha = .3, size = 3) + scale_color_manual(values = c( "plum4", "palegreen4", "indianred", "cornflowerblue" )) + theme_linedraw() + theme_light() + theme(legend.position = "bottom") + guides(colour = guide_legend(nrow = 2)) + theme(legend.title = element_blank()) + geom_text_repel(data = df.idx, aes(label = Line), size = 3.5) + geom_vline(xintercept = quantile(df.res$impact, .75), linetype = "dashed") + geom_hline( yintercept = quantile(df.res$log.betweenness, .75), linetype = "dashed" ) + xlab("Impact") + ylab("Log(betweenness+1)") res <- list() res$network <- net res$network.modules <- cluster_vector res$plot.network <- net_p # plot gene network with symbolic pheno res$plot.impact_betweenness <- im_be_p # plot impact betweenees res$stats.impact_betweenness <- df.res # impact and betweenees data return(res) } cosM <- function(x, mode = c("normal", "hybrid")) { #' -------------------------------------------------------------------- #' library("pracma") #' pinv: Pseudoinverse (Moore-Penrose Generalized Inverse) pinv <- function (A, tol = .Machine$double.eps^(2/3)) { stopifnot(is.numeric(A), length(dim(A)) == 2, is.matrix(A)) s <- svd(A) p <- (s$d > max(tol * s$d[1], 0)) if (all(p)) { mp <- s$v %*% (1/s$d * t(s$u)) } else if (any(p)) { mp <- s$v[, p, drop=FALSE] %*% (1/s$d[p] * t(s$u[, p, drop=FALSE])) } else { mp <- matrix(0, nrow=ncol(A), ncol=nrow(A)) } return(mp) } mldivide <- function(A, B, pinv = TRUE) { stopifnot( is.numeric(A) || is.complex(A), is.numeric(B) || is.complex(B)) if (is.vector(A)) A <- as.matrix(A) if (is.vector(B)) B <- as.matrix(B) if (nrow(A) != nrow(B)) { stop("Matrices 'A' and 'B' must have the same number of rows.") } if (pinv) { pinv(t(A) %*% A) %*% t(A) %*% B } else { qr.solve(A, B) } } mrdivide <- function(A, B, pinv = TRUE) { stopifnot( is.numeric(A) || is.complex(A), is.numeric(B) || is.complex(B)) if (is.vector(A)) A <- t(A) if (is.vector(B)) B <- t(B) if (ncol(A) != ncol(B)) { stop("Matrices 'A' and 'B' must have the same number of columns.") } t(mldivide(t(B), t(A), pinv = pinv)) } #' -------------------------------------------------------------------- #' compute covariance Cov <- cov(x, use = "pairwise.complete.obs") n <- dim(x)[1] m <- dim(x)[2] #' compute eigenvalues Eig <- eigen(Cov) score <- mrdivide(as.matrix(x), t(Eig$vectors)) #' wl-25-10-2020, Sun: n must be larger than m othwise this script fails. if (any(Eig$values < 0)) { stop("Cov Not Positive SemiDefinite !") } #' compute pairwise cosine similarity C <- rep(0, n * (n - 1) / 2) d <- 0 for (i in 1:(n - 1)) { for (j in (i + 1):n) { d <- d + 1 #' inner product and norms inner <- 0 normx1 <- 0 normy1 <- 0 for (l in 1:m) { if (mode == "normal") { inner <- inner + score[i, l] * score[j, l] / Eig$values[l] } else if (mode == "hybrid") { inner <- inner + score[i, l] * score[j, l] } normx1 <- normx1 + (score[i, l] * score[i, l] / Eig$values[l]) normy1 <- normy1 + (score[j, l] * score[j, l] / Eig$values[l]) } C[d] <- inner / (sqrt(normx1) * sqrt(normy1)) } } #' wl-25-10-2020, Sun: convert to symmetric matrix if (T) { #' wl's implementation #' mat <- matrix(1, n, n) #' mat[lower.tri(mat, diag = F)] <- C #' mat[which(lower.tri(t(mat)), arr.ind = T)[, c(2,1)]] <- C #' ji's implemetation mat <- matrix(0, n, n) mat[lower.tri(mat, diag = F)] <- C mat <- mat + t(mat) diag(mat) <- 1 dimnames(mat) <- list(rownames(x), rownames(x)) return(mat) } else { return(C) } } #' #' ======================================================================= #' From R package "lsa" #' x <- iris[, -5] #' cosine(as.matrix(x)) #' cosine(as.matrix(t(x))) #' cosine <- function(x, y = NULL) { if (is.matrix(x) && is.null(y)) { co <- array(0, c(ncol(x), ncol(x))) f <- colnames(x) dimnames(co) <- list(f, f) for (i in 2:ncol(x)) { for (j in 1:(i - 1)) { co[i, j] <- cosine(x[, i], x[, j]) } } co <- co + t(co) diag(co) <- 1 return(as.matrix(co)) } else if (is.vector(x) && is.vector(y)) { return(crossprod(x, y) / sqrt(crossprod(x) * crossprod(y))) } else if (is.vector(x) && is.matrix(y)) { co <- vector(mode = "numeric", length = ncol(y)) names(co) <- colnames(y) for (i in 1:ncol(y)) { co[i] <- cosine(x, y[, i]) } return(co) } else { stop("argument mismatch. Either one matrix or two vectors needed as input.") } }