comparison scripts/estimateprops.R @ 3:fd7a16d073c5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 20f8561478535013e111d982b99639f48f1bea79"
author bgruening
date Sat, 29 Jan 2022 12:51:49 +0000
parents 3ca0132c182a
children 56371b5a2da9
comparison
equal deleted inserted replaced
2:1c4cf4b7debe 3:fd7a16d073c5
15 samples = samples_label, select.ct = celltypes, verbose = T) 15 samples = samples_label, select.ct = celltypes, verbose = T)
16 16
17 estimated_music_props <- est_prop$Est.prop.weighted 17 estimated_music_props <- est_prop$Est.prop.weighted
18 estimated_nnls_props <- est_prop$Est.prop.allgene 18 estimated_nnls_props <- est_prop$Est.prop.allgene
19 19
20 scale_yaxes <- function(gplot, value) {
21 if (is.na(value)) {
22 gplot
23 } else {
24 gplot + scale_y_continuous(lim = c(0, value))
25 }
26 }
27
20 ## Show different in estimation methods 28 ## Show different in estimation methods
21 ## Jitter plot of estimated cell type proportions 29 ## Jitter plot of estimated cell type proportions
22 jitter_fig <- Jitter_Est( 30 jitter_fig <- scale_yaxes(Jitter_Est(
23 list(data.matrix(estimated_music_props), 31 list(data.matrix(estimated_music_props),
24 data.matrix(estimated_nnls_props)), 32 data.matrix(estimated_nnls_props)),
25 method.name = methods, title = "Jitter plot of Est Proportions", 33 method.name = methods, title = "Jitter plot of Est Proportions",
26 size = 2, alpha = 0.7) + theme_minimal() 34 size = 2, alpha = 0.7) + theme_minimal(), maxyscale)
27 35
28 36
29 ## Make a Plot 37 ## Make a Plot
30 ## A more sophisticated jitter plot is provided as below. We separated 38 ## A more sophisticated jitter plot is provided as below. We separated
31 ## the T2D subjects and normal subjects by their disease factor levels. 39 ## the T2D subjects and normal subjects by their disease factor levels.
40 celltypes <- levels(m_prop$CellType) 48 celltypes <- levels(m_prop$CellType)
41 message("No celltypes declared, using:") 49 message("No celltypes declared, using:")
42 message(celltypes) 50 message(celltypes)
43 } 51 }
44 52
45 if (phenotype_target_threshold == -99) {
46 phenotype_target_threshold <- -Inf
47 message("phenotype target threshold set to -Inf")
48 }
49
50 if (is.null(phenotype_factors)) { 53 if (is.null(phenotype_factors)) {
51 phenotype_factors <- colnames(pData(bulk_eset)) 54 phenotype_factors <- colnames(pData(bulk_eset))
52 } 55 }
53 ## filter out unwanted factors like "sampleID" and "subjectName" 56 ## filter out unwanted factors like "sampleID" and "subjectName"
54 phenotype_factors <- phenotype_factors[ 57 phenotype_factors <- phenotype_factors[
55 !(phenotype_factors %in% phenotype_factors_always_exclude)] 58 !(phenotype_factors %in% phenotype_factors_always_exclude)]
56 message("Phenotype Factors to use:") 59 message("Phenotype Factors to use:")
57 message(phenotype_factors) 60 message(paste0(phenotype_factors, collapse = ", "))
58
59 61
60 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint 62 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint
61 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint 63 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint
62 levels = methods) 64 levels = methods)
63 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint 65
64 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] 66 if (use_disease_factor) {
65 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 67
66 sample_groups <- c("Normal", sample_disease_group) 68 if (phenotype_target_threshold == -99) {
67 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint 69 phenotype_target_threshold <- -Inf
68 levels = sample_groups) 70 message("phenotype target threshold set to -Inf")
69 71 }
70 ## Binary to scale: e.g. TRUE / 5 = 0.2 72
71 m_prop$D <- (m_prop$Disease == # nolint 73 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint
72 sample_disease_group) / sample_disease_group_scale 74 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]
73 ## NA's are not included in the comparison below 75 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2
74 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), 76 sample_groups <- c("Normal", sample_disease_group)
75 subset(m_prop, Disease == sample_disease_group)) 77 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint
76 78 levels = sample_groups)
77 jitter_new <- ggplot(m_prop, aes(Method, Prop)) + 79
78 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), 80 ## Binary to scale: e.g. TRUE / 5 = 0.2
79 size = 2, alpha = 0.7, 81 m_prop$D <- (m_prop$Disease == # nolint
80 position = position_jitter(width = 0.25, height = 0)) +
81 facet_wrap(~ CellType, scales = "free") +
82 scale_colour_manual(values = c("white", "gray20")) +
83 scale_shape_manual(values = c(21, 24)) + theme_minimal()
84
85 ## Plot to compare method effectiveness
86 ## Create dataframe for beta cell proportions and Disease_factor levels
87 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint
88 phenotype_factors],
89 ct.prop = c(estimated_music_props[, 2],
90 estimated_nnls_props[, 2]),
91 Method = factor(rep(methods,
92 each = nrow(estimated_music_props)),
93 levels = methods))
94 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint
95 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target]))
96 m_prop_ana$Disease <- factor(sample_groups[( # nolint
97 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1],
98 sample_groups)
99 m_prop_ana$D <- (m_prop_ana$Disease == # nolint
100 sample_disease_group) / sample_disease_group_scale 82 sample_disease_group) / sample_disease_group_scale
101 83 ## NA's are not included in the comparison below
102 jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + 84 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),
103 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + 85 subset(m_prop, Disease == sample_disease_group))
104 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), 86
105 size = 2, alpha = 0.7) + facet_wrap(~ Method) + 87 jitter_new <- scale_yaxes(ggplot(m_prop, aes(Method, Prop)) +
106 ggtitle(compare_title) + theme_minimal() + 88 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
107 scale_colour_manual(values = c("white", "gray20")) + 89 size = 2, alpha = 0.7,
108 scale_shape_manual(values = c(21, 24)) 90 position = position_jitter(width = 0.25, height = 0)) +
91 facet_wrap(~ CellType, scales = "free") +
92 scale_colour_manual(values = c("white", "gray20")) +
93 scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale)
94
95 }
96
97 if (use_disease_factor) {
98
99 ## Plot to compare method effectiveness
100 ## Create dataframe for beta cell proportions and Disease_factor levels
101 ## - Ugly code. Essentially, doubles the cell type proportions for each
102 ## set of MuSiC and NNLS methods
103 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint
104 phenotype_factors],
105 ## get proportions of target cell type
106 ct.prop = c(estimated_music_props[, phenotype_scrna_target],
107 estimated_nnls_props[, phenotype_scrna_target]),
108 ##
109 Method = factor(rep(methods,
110 each = nrow(estimated_music_props)),
111 levels = methods))
112 ## - fix headers
113 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint
114 ## - drop NA for target phenotype (e.g. hba1c)
115 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target]))
116 m_prop_ana$Disease <- factor( # nolint
117 ## - Here we set Normal/Disease assignments across the two MuSiC and NNLS methods
118 sample_groups[(
119 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1
120 ],
121 sample_groups)
122 ## - Then we scale this binary assignment to a plotable factor
123 m_prop_ana$D <- (m_prop_ana$Disease == # nolint
124 sample_disease_group) / sample_disease_group_scale
125
126 jitt_compare <- scale_yaxes(ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) +
127 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) +
128 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
129 size = 2, alpha = 0.7) + facet_wrap(~ Method) +
130 ggtitle(paste0(toupper(phenotype_target), " vs. ",
131 toupper(phenotype_scrna_target), " Cell Type Proportion")) +
132 theme_minimal() +
133 ylab(paste0("Proportion of ",
134 phenotype_scrna_target, " cells")) +
135 xlab(paste0("Level of bulk factor (", phenotype_target, ")")) +
136 scale_colour_manual(values = c("white", "gray20")) +
137 scale_shape_manual(values = c(21, 24)), maxyscale)
138 }
109 139
110 ## BoxPlot 140 ## BoxPlot
111 plot_box <- Boxplot_Est(list( 141 plot_box <- scale_yaxes(Boxplot_Est(list(
112 data.matrix(estimated_music_props), 142 data.matrix(estimated_music_props),
113 data.matrix(estimated_nnls_props)), 143 data.matrix(estimated_nnls_props)),
114 method.name = c("MuSiC", "NNLS")) + 144 method.name = c("MuSiC", "NNLS")) +
115 theme(axis.text.x = element_text(angle = -90), 145 theme(axis.text.x = element_text(angle = -90),
116 axis.text.y = element_text(size = 8)) + 146 axis.text.y = element_text(size = 8)) +
117 ggtitle(element_blank()) + theme_minimal() 147 ggtitle(element_blank()) + theme_minimal(), maxyscale)
118 148
119 ## Heatmap 149 ## Heatmap
120 plot_hmap <- Prop_heat_Est(list( 150 plot_hmap <- Prop_heat_Est(list(
121 data.matrix(estimated_music_props), 151 data.matrix(estimated_music_props),
122 data.matrix(estimated_nnls_props)), 152 data.matrix(estimated_nnls_props)),
123 method.name = c("MuSiC", "NNLS")) + 153 method.name = c("MuSiC", "NNLS")) +
124 theme(axis.text.x = element_text(angle = -90), 154 theme(axis.text.x = element_text(angle = -90),
125 axis.text.y = element_text(size = 6)) 155 axis.text.y = element_text(size = 6))
126 156
127 pdf(file = outfile_pdf, width = 8, height = 8) 157 pdf(file = outfile_pdf, width = 8, height = 8)
128 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) 158 if (length(celltypes) <= 8) {
129 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) 159 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)
160 } else {
161 print(jitter_fig)
162 plot_box
163 }
164 if (use_disease_factor) {
165 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)
166 }
130 plot_hmap 167 plot_hmap
131 message(dev.off()) 168 message(dev.off())
132 169
133 ## Output Proportions 170 ## Output Proportions
134 171
157 "Matrix of Variance of MuSiC Estimates", 194 "Matrix of Variance of MuSiC Estimates",
158 ".tabular"), 195 ".tabular"),
159 quote = F, sep = "\t", col.names = NA) 196 quote = F, sep = "\t", col.names = NA)
160 197
161 198
162 ## Summary table 199 if (use_disease_factor) {
163 for (meth in methods) { 200 ## Summary table of linear regressions of disease factors
164 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = 201 for (meth in methods) {
165 sub_data <- subset(m_prop_ana, Method == meth) 202 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =
166 ## We can only do regression where there are more than 1 factors 203 sub_data <- subset(m_prop_ana, Method == meth)
167 ## so we must find and exclude the ones which are not 204
168 gt1_facts <- sapply(phenotype_factors, function(facname) { 205 ## We can only do regression where there are more than 1 factors
169 return(length(unique(sort(sub_data[[facname]]))) == 1) 206 ## so we must find and exclude the ones which are not
170 }) 207 gt1_facts <- sapply(phenotype_factors, function(facname) {
171 form_factors <- phenotype_factors 208 return(length(unique(sort(sub_data[[facname]]))) == 1)
172 exclude_facts <- names(gt1_facts)[gt1_facts] 209 })
173 if (length(exclude_facts) > 0) { 210 form_factors <- phenotype_factors
174 message("Factors with only one level will be excluded:") 211 exclude_facts <- names(gt1_facts)[gt1_facts]
175 message(exclude_facts) 212 if (length(exclude_facts) > 0) {
176 form_factors <- phenotype_factors[ 213 message("Factors with only one level will be excluded:")
177 !(phenotype_factors %in% exclude_facts)] 214 message(exclude_facts)
215 form_factors <- phenotype_factors[
216 !(phenotype_factors %in% exclude_facts)]
217 }
218 lm_beta_meth <- lm(as.formula(
219 paste("ct.prop", paste(form_factors, collapse = " + "),
220 sep = " ~ ")), data = sub_data)
221 message(paste0("Summary: ", meth))
222 capture.output(summary(lm_beta_meth),
223 file = paste0("report_data/summ_Log of ",
224 meth,
225 " fitting.txt"))
178 } 226 }
179 lm_beta_meth <- lm(as.formula( 227 }
180 paste("ct.prop", paste(form_factors, collapse = " + "),
181 sep = " ~ ")), data = sub_data)
182 message(paste0("Summary: ", meth))
183 capture.output(summary(lm_beta_meth),
184 file = paste0("report_data/summ_Log of ",
185 meth,
186 " fitting.txt"))
187 }