comparison aurora_wgcna_trait.Rmd @ 3:d1a0b7ded7e3 draft

Uploaded
author spficklin
date Fri, 22 Nov 2019 19:45:51 -0500
parents
children f2d2ec70b7d8
comparison
equal deleted inserted replaced
2:9e8788803adc 3:d1a0b7ded7e3
1 ---
2 title: 'Aurora Galaxy WGCNA Tool: Gene Co-Expression Network Construction & Analysis. Part 2'
3 output:
4 pdf_document:
5 number_sections: false
6 ---
7
8 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
9 knitr::opts_chunk$set(error = FALSE, echo = FALSE)
10 ```
11 ```{r}
12 # Load the data from the previous step.
13 load(file=opt$r_data)
14 ```
15 # Introduction
16 This report is part two of step-by-step results from use of the [Aurora Galaxy](https://github.com/statonlab/aurora-galaxy-tools) Weighted Gene Co-expression Network Analysis [WGCNA](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559) tool. It is generated when trait or phenotype data is provided.
17
18 This report was generated on:
19 ```{r}
20 format(Sys.time(), "%a %b %d %X %Y")
21 ```
22
23 # Trait/Phenotype Data
24
25 The contents below show the first 10 rows and 6 columns of trait/phenotype data provided. However, any columns that were indicated should be removed, were removed and any categorical columns specified were converte to a one-hot enconding (e.g. 0 when present 1 when not present). The updated trait/phenotype data matrix has been saved into a comma-separated file named `updated_trait_matrix.csv`.
26
27 ```{r}
28 # Load the trait data file.
29 trait_data = data.frame()
30 trait_data = read.csv(opt$trait_data, header = TRUE, row.names = opt$sname_col, na.strings = opt$missing_value2)
31 sample_names = rownames(gemt)
32 trait_rows = match(sample_names, rownames(trait_data))
33 trait_data = trait_data[trait_rows, ]
34
35 # Determine the column types within the trait annotation data.
36 trait_types = sapply(trait_data, class)
37
38 # If the type is character we should convert it to a factor manually.
39 character_fields = colnames(trait_data)[which(trait_types == "character")]
40 if (length(character_fields) > 0) {
41 for (field in character_fields) {
42 trait_data[[field]] = as.factor(trait_data[[field]])
43 }
44 }
45
46 # Remove ignored columns.
47 ignore_cols = strsplit(opt$ignore_cols, ',')[[1]]
48 if (length(ignore_cols) > 0) {
49 print('You chose to ignore the following fields:')
50 print(ignore_cols)
51 trait_data = trait_data[, colnames(trait_data)[!(colnames(trait_data) %in% ignore_cols)]]
52 }
53
54 # Make sure we don't one-hot-encoude any columns that were also ignored.
55 one_hot_cols = strsplit(opt$one_hot_cols, ',')[[1]]
56 one_hot_cols = one_hot_cols[which(!(one_hot_cols %in% ignore_cols))]
57
58 # Change any categorical fields to 1 hot encoding as requested by the caller.
59 if (length(one_hot_cols) > 0) {
60 print('You chose to treat the following fields as categorical:')
61 print(one_hot_cols)
62
63 # Make sure we have enough levels for 1-hot encoding. We must have at least two.
64 hkeep = c()
65 hignore = c()
66 for (field in one_hot_cols[[i]]) {
67
68 # Make sure the field is categorical. If it came in as integer it must be switched.
69 if (trait_types[[field]] == "integer") {
70 trait_data[[field]] = as.factor(trait_data[[field]])
71 }
72 if (trait_types[[field]] == "numeric") {
73 print('The following quantitative field will be treated as numeric instead.')
74 print(field)
75 next
76 }
77
78 # Now make sure we have enough factors.
79 if (nlevels(trait_data[[field]]) > 1) {
80 hkeep[length(hkeep)+1] = field
81 } else {
82 hignore[length(hignore)+1] = field
83 }
84 }
85
86 if (length(hignore) > 0) {
87 print('These fields were ignored due to too few factors:')
88 print(hignore)
89 }
90
91 # Perform the 1-hot encoding for specified and valid fields.
92 if (length(hkeep) > 0) {
93 print('These fields were be 1-hot encoded:')
94 print(hkeep)
95
96 swap_cols = colnames(trait_data)[(colnames(trait_data) %in% hkeep)]
97 temp = as.data.frame(trait_data[, swap_cols])
98 colnames(temp) = swap_cols
99 temp = apply(temp, 2, make.names)
100 dmy <- dummyVars(" ~ .", data = temp)
101 encoded <- data.frame(predict(dmy, newdata = temp))
102 encoded = sapply(encoded, as.integer)
103
104 # Make a new trait_data table with these new 1-hot fields.
105 keep_cols = colnames(trait_data)[!(colnames(trait_data) %in% one_hot_cols[[1]])]
106 keep = as.data.frame(trait_data[, keep_cols])
107 colnames(keep) = keep_cols
108
109 # Make a new trait_data object that has the columns to keep and the new 1-hot columns.
110 trait_data = cbind(keep, encoded)
111 }
112 }
113
114 # Write the new trait data file.
115 write.csv(trait_data, file=opt$updated_trait_matrix, quote=FALSE)
116
117 #datatable(trait_data)
118 trait_data[1:10,1:6]
119 ```
120 # Module-Condition Association
121
122 Now that we have trait/phenotype data, we can explore if any of the network modules are asociated with these features. First, is an empirical exploration by viewing again the sample dendrogram but with traits added and colored by category or numerical intensity, as appropriate. If groups of samples with similar expression also share similar annotations then the same colors will appear "in blocks" under the clustered samples. This view does not indicate associations but can help visualize when some modules might be associated.
123
124 ```{r}
125 # Determine the column types within the trait annotation data.
126 trait_types = sapply(trait_data, class)
127
128 # So that we can merge colors together with a cbind, create a
129 # data frame with an empty column
130 trait_colors = data.frame(empty = rep(1:dim(trait_data)[1]))
131
132 # Set the colors for the quantitative data.
133 quantitative_fields = colnames(trait_data)[which(trait_types == "numeric")]
134 if (length(quantitative_fields) > 0) {
135 qdata = as.data.frame(trait_data[,quantitative_fields])
136 quantitative_colors = numbers2colors(qdata, signed = FALSE)
137 colnames(quantitative_colors) = quantitative_fields
138 trait_colors = cbind(trait_colors, quantitative_colors)
139 }
140
141 # Set the colors for the categorical data but only if the column
142 # has more than one factor. For columns with more than one factor
143 # we should dump that column.
144 categorical_fields = colnames(trait_data)[which(trait_types == "factor")]
145 if (length(categorical_fields) > 0) {
146 cdata = as.data.frame(trait_data[,categorical_fields])
147 categorical_colors = labels2colors(cdata)
148 colnames(categorical_colors) = categorical_fields
149 trait_colors = cbind(trait_colors, categorical_colors)
150 }
151
152 # Set the colors for the ordinal data.
153 ordinal_fields = colnames(trait_data)[which(trait_types == "integer")]
154 if (length(ordinal_fields) > 0) {
155 odata = as.data.frame(trait_data[,ordinal_fields])
156 ordinal_colors = numbers2colors(odata, signed = FALSE)
157 colnames(ordinal_colors) = ordinal_fields
158 trait_colors = cbind(trait_colors, ordinal_colors)
159 }
160
161 # Reorder the colors to match the same order of columns in the trait_data df.
162 trait_order = colnames(trait_data)[colnames(trait_data) %in% colnames(trait_colors)]
163 trait_colors = trait_colors[,trait_order]
164 trait_data = trait_data[,trait_order]
165 plotSampleDendroTraits <- function() {
166 plotDendroAndColors(sampleTree, trait_colors,
167 groupLabels = names(trait_data),
168 main = "Sample Dendrogram and Annotation Heatmap",
169 cex.dendroLabels = 0.5)
170 }
171
172 png('figures/07-sample_trait_dendrogram.png', width=6 ,height=10, units="in", res=300)
173 plotSampleDendroTraits()
174 invisible(dev.off())
175 plotSampleDendroTraits()
176 ```
177
178 To statistically identify the associations, correlation tests are performed of the eigengenes of each module with the annotation data. The following heatmap shows the results between each annotation feature and each module. Modules with a signficant positive assocation have a correlation value near 1. Modules with a significant negative association have a correlation value near -1. Modules with no correlation have a value near 0.
179
180 ```{r fig.align='center', fig.width=15, fig.height=15}
181 MEs = orderMEs(MEs)
182 moduleTraitCor = cor(MEs, trait_data, use = "p");
183 moduleTraitPvalue = corPvalueStudent(moduleTraitCor, n_samples);
184
185 plotModuleTraitHeatmap <- function() {
186 # The WGCNA labeledHeatmap function is too overloaded with detail, we'll create a simpler plot.
187 plotData = melt(moduleTraitCor)
188 # We want to makes sure the order is the same as in the
189 # labeledHeatmap function (example above)
190 plotData$Var1 = factor(plotData$Var1, levels = rev(colnames(MEs)), ordered=TRUE)
191 # Now use ggplot2 to make a nicer image.
192 p <- ggplot(plotData, aes(Var2, Var1, fill=value)) +
193 geom_tile() + xlab('Experimental Conditions') + ylab('WGCNA Modules') +
194 scale_fill_gradient2(low = "#0072B2", high = "#D55E00",
195 mid = "white", midpoint = 0,
196 limit = c(-1,1), name="PCC") +
197 theme_bw() +
198 theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1, size=15),
199 axis.text.y = element_text(angle = 0, hjust=1, vjust=0.5, size=15),
200 legend.text=element_text(size=15),
201 panel.border = element_blank(),
202 panel.grid.major = element_blank(),
203 panel.grid.minor = element_blank(),
204 axis.line = element_blank())
205 print(p)
206 }
207 png('figures/08-module_trait_dendrogram.png', width=12 ,height=12, units="in", res=300)
208 plotModuleTraitHeatmap()
209 invisible(dev.off())
210 plotModuleTraitHeatmap()
211 ```
212
213 ```{r}
214 output = cbind(moduleTraitCor, moduleTraitPvalue)
215 write.csv(output, file = opt$module_association_file, quote=FALSE, row.names=TRUE)
216 ```
217 A file has been generated named `module_association.csv` which conatins the list of modules, and their correlation values as well as p-values indicating the strength of the associations.
218 ```{r}
219 # names (colors) of the modules
220 modNames = substring(names(MEs), 3)
221 geneModuleMembership = as.data.frame(cor(gemt, MEs, use = "p"));
222 MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), n_samples));
223 names(geneModuleMembership) = paste("MM", modNames, sep="");
224 names(MMPvalue) = paste("p.MM", modNames, sep="");
225
226 # Calculate the gene trait significance as a Pearson's R and p-value.
227 gts = as.data.frame(cor(gemt, trait_data, use = "p"));
228 gtsp = as.data.frame(corPvalueStudent(as.matrix(gts), n_samples));
229 colnames(gtsp) = c(paste("p", names(trait_data), sep="."))
230 colnames(gts) = c(paste("GS", names(trait_data), sep="."))
231
232 # Write out the gene information.
233 output = cbind(Module = module_labels, gts, gtsp)
234 write.csv(output, file = opt$gene_association_file, quote=FALSE, row.names=TRUE)
235
236 ```
237 Genes themselves can also have assocation with traits. This is calculated via a traditional correlation test as well. Another file has been generated named `gene_association.csv` which provides the list of genes, the modules they belong to and the assocaition of each gene to the trait features.