3
|
1 ---
|
|
2 title: 'Aurora Galaxy WGCNA Tool: Gene Co-Expression Network Construction & Analysis. Part 2'
|
|
3 output:
|
16
|
4 html_document:
|
3
|
5 number_sections: false
|
|
6 ---
|
|
7
|
|
8 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
|
|
9 knitr::opts_chunk$set(error = FALSE, echo = FALSE)
|
|
10 ```
|
11
|
11 ```{r, include=FALSE}
|
|
12 options(tinytex.verbose = TRUE)
|
|
13 ```
|
3
|
14 ```{r}
|
|
15 # Load the data from the previous step.
|
|
16 load(file=opt$r_data)
|
|
17 ```
|
|
18 # Introduction
|
|
19 This report is part two of step-by-step results from use of the [Aurora Galaxy](https://github.com/statonlab/aurora-galaxy-tools) Weighted Gene Co-expression Network Analysis [WGCNA](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559) tool. It is generated when trait or phenotype data is provided.
|
|
20
|
|
21 This report was generated on:
|
|
22 ```{r}
|
|
23 format(Sys.time(), "%a %b %d %X %Y")
|
|
24 ```
|
|
25
|
|
26 # Trait/Phenotype Data
|
|
27
|
|
28 The contents below show the first 10 rows and 6 columns of trait/phenotype data provided. However, any columns that were indicated should be removed, were removed and any categorical columns specified were converte to a one-hot enconding (e.g. 0 when present 1 when not present). The updated trait/phenotype data matrix has been saved into a comma-separated file named `updated_trait_matrix.csv`.
|
|
29
|
|
30 ```{r}
|
|
31 # Load the trait data file.
|
|
32 trait_data = data.frame()
|
|
33 trait_data = read.csv(opt$trait_data, header = TRUE, row.names = opt$sname_col, na.strings = opt$missing_value2)
|
|
34 sample_names = rownames(gemt)
|
|
35 trait_rows = match(sample_names, rownames(trait_data))
|
|
36 trait_data = trait_data[trait_rows, ]
|
|
37
|
|
38 # Determine the column types within the trait annotation data.
|
|
39 trait_types = sapply(trait_data, class)
|
|
40
|
|
41 # If the type is character we should convert it to a factor manually.
|
|
42 character_fields = colnames(trait_data)[which(trait_types == "character")]
|
|
43 if (length(character_fields) > 0) {
|
|
44 for (field in character_fields) {
|
|
45 trait_data[[field]] = as.factor(trait_data[[field]])
|
|
46 }
|
|
47 }
|
|
48
|
|
49 # Remove ignored columns.
|
|
50 ignore_cols = strsplit(opt$ignore_cols, ',')[[1]]
|
|
51 if (length(ignore_cols) > 0) {
|
|
52 print('You chose to ignore the following fields:')
|
|
53 print(ignore_cols)
|
|
54 trait_data = trait_data[, colnames(trait_data)[!(colnames(trait_data) %in% ignore_cols)]]
|
|
55 }
|
|
56
|
|
57 # Make sure we don't one-hot-encoude any columns that were also ignored.
|
|
58 one_hot_cols = strsplit(opt$one_hot_cols, ',')[[1]]
|
|
59 one_hot_cols = one_hot_cols[which(!(one_hot_cols %in% ignore_cols))]
|
|
60
|
|
61 # Change any categorical fields to 1 hot encoding as requested by the caller.
|
|
62 if (length(one_hot_cols) > 0) {
|
|
63 print('You chose to treat the following fields as categorical:')
|
|
64 print(one_hot_cols)
|
|
65
|
|
66 # Make sure we have enough levels for 1-hot encoding. We must have at least two.
|
|
67 hkeep = c()
|
|
68 hignore = c()
|
|
69 for (field in one_hot_cols[[i]]) {
|
11
|
70
|
3
|
71 # Make sure the field is categorical. If it came in as integer it must be switched.
|
|
72 if (trait_types[[field]] == "integer") {
|
|
73 trait_data[[field]] = as.factor(trait_data[[field]])
|
|
74 }
|
|
75 if (trait_types[[field]] == "numeric") {
|
|
76 print('The following quantitative field will be treated as numeric instead.')
|
|
77 print(field)
|
|
78 next
|
|
79 }
|
11
|
80
|
3
|
81 # Now make sure we have enough factors.
|
|
82 if (nlevels(trait_data[[field]]) > 1) {
|
|
83 hkeep[length(hkeep)+1] = field
|
|
84 } else {
|
|
85 hignore[length(hignore)+1] = field
|
|
86 }
|
|
87 }
|
11
|
88
|
3
|
89 if (length(hignore) > 0) {
|
|
90 print('These fields were ignored due to too few factors:')
|
|
91 print(hignore)
|
|
92 }
|
11
|
93
|
3
|
94 # Perform the 1-hot encoding for specified and valid fields.
|
|
95 if (length(hkeep) > 0) {
|
|
96 print('These fields were be 1-hot encoded:')
|
|
97 print(hkeep)
|
11
|
98
|
3
|
99 swap_cols = colnames(trait_data)[(colnames(trait_data) %in% hkeep)]
|
|
100 temp = as.data.frame(trait_data[, swap_cols])
|
|
101 colnames(temp) = swap_cols
|
|
102 temp = apply(temp, 2, make.names)
|
|
103 dmy <- dummyVars(" ~ .", data = temp)
|
|
104 encoded <- data.frame(predict(dmy, newdata = temp))
|
|
105 encoded = sapply(encoded, as.integer)
|
11
|
106
|
3
|
107 # Make a new trait_data table with these new 1-hot fields.
|
|
108 keep_cols = colnames(trait_data)[!(colnames(trait_data) %in% one_hot_cols[[1]])]
|
|
109 keep = as.data.frame(trait_data[, keep_cols])
|
|
110 colnames(keep) = keep_cols
|
11
|
111
|
3
|
112 # Make a new trait_data object that has the columns to keep and the new 1-hot columns.
|
|
113 trait_data = cbind(keep, encoded)
|
|
114 }
|
|
115 }
|
|
116
|
|
117 # Write the new trait data file.
|
|
118 write.csv(trait_data, file=opt$updated_trait_matrix, quote=FALSE)
|
|
119
|
|
120 #datatable(trait_data)
|
|
121 trait_data[1:10,1:6]
|
|
122 ```
|
|
123 # Module-Condition Association
|
|
124
|
|
125 Now that we have trait/phenotype data, we can explore if any of the network modules are asociated with these features. First, is an empirical exploration by viewing again the sample dendrogram but with traits added and colored by category or numerical intensity, as appropriate. If groups of samples with similar expression also share similar annotations then the same colors will appear "in blocks" under the clustered samples. This view does not indicate associations but can help visualize when some modules might be associated.
|
|
126
|
|
127 ```{r}
|
|
128 # Determine the column types within the trait annotation data.
|
|
129 trait_types = sapply(trait_data, class)
|
|
130
|
|
131 # So that we can merge colors together with a cbind, create a
|
|
132 # data frame with an empty column
|
|
133 trait_colors = data.frame(empty = rep(1:dim(trait_data)[1]))
|
|
134
|
|
135 # Set the colors for the quantitative data.
|
|
136 quantitative_fields = colnames(trait_data)[which(trait_types == "numeric")]
|
|
137 if (length(quantitative_fields) > 0) {
|
|
138 qdata = as.data.frame(trait_data[,quantitative_fields])
|
|
139 quantitative_colors = numbers2colors(qdata, signed = FALSE)
|
|
140 colnames(quantitative_colors) = quantitative_fields
|
|
141 trait_colors = cbind(trait_colors, quantitative_colors)
|
|
142 }
|
|
143
|
|
144 # Set the colors for the categorical data but only if the column
|
|
145 # has more than one factor. For columns with more than one factor
|
|
146 # we should dump that column.
|
|
147 categorical_fields = colnames(trait_data)[which(trait_types == "factor")]
|
|
148 if (length(categorical_fields) > 0) {
|
|
149 cdata = as.data.frame(trait_data[,categorical_fields])
|
|
150 categorical_colors = labels2colors(cdata)
|
|
151 colnames(categorical_colors) = categorical_fields
|
|
152 trait_colors = cbind(trait_colors, categorical_colors)
|
|
153 }
|
|
154
|
|
155 # Set the colors for the ordinal data.
|
|
156 ordinal_fields = colnames(trait_data)[which(trait_types == "integer")]
|
|
157 if (length(ordinal_fields) > 0) {
|
|
158 odata = as.data.frame(trait_data[,ordinal_fields])
|
|
159 ordinal_colors = numbers2colors(odata, signed = FALSE)
|
|
160 colnames(ordinal_colors) = ordinal_fields
|
|
161 trait_colors = cbind(trait_colors, ordinal_colors)
|
|
162 }
|
|
163
|
|
164 # Reorder the colors to match the same order of columns in the trait_data df.
|
|
165 trait_order = colnames(trait_data)[colnames(trait_data) %in% colnames(trait_colors)]
|
|
166 trait_colors = trait_colors[,trait_order]
|
|
167 trait_data = trait_data[,trait_order]
|
|
168 plotSampleDendroTraits <- function() {
|
|
169 plotDendroAndColors(sampleTree, trait_colors,
|
|
170 groupLabels = names(trait_data),
|
|
171 main = "Sample Dendrogram and Annotation Heatmap",
|
|
172 cex.dendroLabels = 0.5)
|
|
173 }
|
|
174
|
|
175 png('figures/07-sample_trait_dendrogram.png', width=6 ,height=10, units="in", res=300)
|
|
176 plotSampleDendroTraits()
|
|
177 invisible(dev.off())
|
|
178 plotSampleDendroTraits()
|
|
179 ```
|
|
180
|
|
181 To statistically identify the associations, correlation tests are performed of the eigengenes of each module with the annotation data. The following heatmap shows the results between each annotation feature and each module. Modules with a signficant positive assocation have a correlation value near 1. Modules with a significant negative association have a correlation value near -1. Modules with no correlation have a value near 0.
|
|
182
|
|
183 ```{r fig.align='center', fig.width=15, fig.height=15}
|
|
184 MEs = orderMEs(MEs)
|
|
185 moduleTraitCor = cor(MEs, trait_data, use = "p");
|
|
186 moduleTraitPvalue = corPvalueStudent(moduleTraitCor, n_samples);
|
|
187
|
|
188 plotModuleTraitHeatmap <- function() {
|
|
189 # The WGCNA labeledHeatmap function is too overloaded with detail, we'll create a simpler plot.
|
|
190 plotData = melt(moduleTraitCor)
|
|
191 # We want to makes sure the order is the same as in the
|
|
192 # labeledHeatmap function (example above)
|
|
193 plotData$Var1 = factor(plotData$Var1, levels = rev(colnames(MEs)), ordered=TRUE)
|
|
194 # Now use ggplot2 to make a nicer image.
|
|
195 p <- ggplot(plotData, aes(Var2, Var1, fill=value)) +
|
|
196 geom_tile() + xlab('Experimental Conditions') + ylab('WGCNA Modules') +
|
|
197 scale_fill_gradient2(low = "#0072B2", high = "#D55E00",
|
|
198 mid = "white", midpoint = 0,
|
|
199 limit = c(-1,1), name="PCC") +
|
|
200 theme_bw() +
|
|
201 theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1, size=15),
|
|
202 axis.text.y = element_text(angle = 0, hjust=1, vjust=0.5, size=15),
|
|
203 legend.text=element_text(size=15),
|
|
204 panel.border = element_blank(),
|
|
205 panel.grid.major = element_blank(),
|
|
206 panel.grid.minor = element_blank(),
|
|
207 axis.line = element_blank())
|
|
208 print(p)
|
|
209 }
|
|
210 png('figures/08-module_trait_dendrogram.png', width=12 ,height=12, units="in", res=300)
|
|
211 plotModuleTraitHeatmap()
|
|
212 invisible(dev.off())
|
|
213 plotModuleTraitHeatmap()
|
|
214 ```
|
|
215
|
|
216 ```{r}
|
|
217 output = cbind(moduleTraitCor, moduleTraitPvalue)
|
|
218 write.csv(output, file = opt$module_association_file, quote=FALSE, row.names=TRUE)
|
|
219 ```
|
|
220 A file has been generated named `module_association.csv` which conatins the list of modules, and their correlation values as well as p-values indicating the strength of the associations.
|
|
221 ```{r}
|
|
222 # names (colors) of the modules
|
|
223 modNames = substring(names(MEs), 3)
|
|
224 geneModuleMembership = as.data.frame(cor(gemt, MEs, use = "p"));
|
|
225 MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), n_samples));
|
|
226 names(geneModuleMembership) = paste("MM", modNames, sep="");
|
|
227 names(MMPvalue) = paste("p.MM", modNames, sep="");
|
|
228
|
|
229 # Calculate the gene trait significance as a Pearson's R and p-value.
|
|
230 gts = as.data.frame(cor(gemt, trait_data, use = "p"));
|
|
231 gtsp = as.data.frame(corPvalueStudent(as.matrix(gts), n_samples));
|
|
232 colnames(gtsp) = c(paste("p", names(trait_data), sep="."))
|
|
233 colnames(gts) = c(paste("GS", names(trait_data), sep="."))
|
|
234
|
|
235 # Write out the gene information.
|
|
236 output = cbind(Module = module_labels, gts, gtsp)
|
|
237 write.csv(output, file = opt$gene_association_file, quote=FALSE, row.names=TRUE)
|
|
238
|
|
239 ```
|
|
240 Genes themselves can also have assocation with traits. This is calculated via a traditional correlation test as well. Another file has been generated named `gene_association.csv` which provides the list of genes, the modules they belong to and the assocaition of each gene to the trait features.
|