annotate R_functions/plotting_functions.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64e75e21466e Uploaded
pmac
parents:
diff changeset
1 ## Plotting and grouping ##
64e75e21466e Uploaded
pmac
parents:
diff changeset
2 # input data: some number of 2d observations. Each row represents a single observation,
64e75e21466e Uploaded
pmac
parents:
diff changeset
3 # column 1 = variable 1, to be plotted on the x-axis,
64e75e21466e Uploaded
pmac
parents:
diff changeset
4 # column 2 = variable 2, to be plotted on the y-axis
64e75e21466e Uploaded
pmac
parents:
diff changeset
5 # groups: Integer vector with same number of entries as there are rows in the input data,
64e75e21466e Uploaded
pmac
parents:
diff changeset
6 # representing which group each observation belongs to. Negative numbers are not plotted
64e75e21466e Uploaded
pmac
parents:
diff changeset
7 # tags: the tag to put on the legend for each group
64e75e21466e Uploaded
pmac
parents:
diff changeset
8 # plot_colors: colors to use for each group
64e75e21466e Uploaded
pmac
parents:
diff changeset
9 # plot_symbols: symbols to use for each group
64e75e21466e Uploaded
pmac
parents:
diff changeset
10 # plot_title: as name suggests
64e75e21466e Uploaded
pmac
parents:
diff changeset
11 # plot_filename: if this is not null, graph is output to a png with the specified name
64e75e21466e Uploaded
pmac
parents:
diff changeset
12 plot_by_groups = function(input_data, groups, tags, plot_colors, plot_symbols, plot_title, plot_filename=NULL) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
13 if(!is.null(plot_filename)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
14 png(plot_filename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
15 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
16 # leave some extra room on the RHS for the legend
64e75e21466e Uploaded
pmac
parents:
diff changeset
17 par(mar=c(5.1, 4.1, 4.1, 8.1))
64e75e21466e Uploaded
pmac
parents:
diff changeset
18 x = as.numeric(input_data[, 1])
64e75e21466e Uploaded
pmac
parents:
diff changeset
19 y = as.numeric(input_data[, 2])
64e75e21466e Uploaded
pmac
parents:
diff changeset
20 gids = sort(unique(groups[which(groups >= 0)]))
64e75e21466e Uploaded
pmac
parents:
diff changeset
21 n = length(gids)
64e75e21466e Uploaded
pmac
parents:
diff changeset
22
64e75e21466e Uploaded
pmac
parents:
diff changeset
23 # first set up the plot area to the correct dimensions
64e75e21466e Uploaded
pmac
parents:
diff changeset
24 plot(x, y, col="white")
64e75e21466e Uploaded
pmac
parents:
diff changeset
25
64e75e21466e Uploaded
pmac
parents:
diff changeset
26 for (i in 1:n) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
27 gid = gids[i]
64e75e21466e Uploaded
pmac
parents:
diff changeset
28 pts_x = x[which(groups == gid)]
64e75e21466e Uploaded
pmac
parents:
diff changeset
29 pts_y = y[which(groups == gid)]
64e75e21466e Uploaded
pmac
parents:
diff changeset
30 pts_color = plot_colors[i]
64e75e21466e Uploaded
pmac
parents:
diff changeset
31 pts_symbol = plot_symbols[i]
64e75e21466e Uploaded
pmac
parents:
diff changeset
32 points(pts_x, pts_y, col=pts_color, pch=pts_symbol)
64e75e21466e Uploaded
pmac
parents:
diff changeset
33 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
34 legend(x="topright",
64e75e21466e Uploaded
pmac
parents:
diff changeset
35 xpd=TRUE,
64e75e21466e Uploaded
pmac
parents:
diff changeset
36 inset=c(-0.3, 0),
64e75e21466e Uploaded
pmac
parents:
diff changeset
37 col=plot_colors,
64e75e21466e Uploaded
pmac
parents:
diff changeset
38 pch=plot_symbols,
64e75e21466e Uploaded
pmac
parents:
diff changeset
39 legend=tags,
64e75e21466e Uploaded
pmac
parents:
diff changeset
40 text.col=plot_colors)
64e75e21466e Uploaded
pmac
parents:
diff changeset
41 title(main=plot_title)
64e75e21466e Uploaded
pmac
parents:
diff changeset
42 if(!is.null(plot_filename)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
43 dev.off()
64e75e21466e Uploaded
pmac
parents:
diff changeset
44 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
45 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
46
64e75e21466e Uploaded
pmac
parents:
diff changeset
47 # Controls vs cases plot. Colour controls blue, cases red,
64e75e21466e Uploaded
pmac
parents:
diff changeset
48 # Samples which are neither control nor case are black.
64e75e21466e Uploaded
pmac
parents:
diff changeset
49 setup_cvc_plot = function(pca_data, control_tag, cases_tag) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
50 plot_info = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
51 nsamples = length(pca_data$ids)
64e75e21466e Uploaded
pmac
parents:
diff changeset
52 groups = rep(1, nsamples)
64e75e21466e Uploaded
pmac
parents:
diff changeset
53 control_legend = paste0("CO: ", control_tag)
64e75e21466e Uploaded
pmac
parents:
diff changeset
54 cases_legend = paste0("CA: ", cases_tag)
64e75e21466e Uploaded
pmac
parents:
diff changeset
55 if (!is.null(control_tag)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
56 groups[grep(control_tag, pca_data$ids)] = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
57 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
58 if (!is.null(cases_tag)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
59 groups[grep(cases_tag, pca_data$ids)] = 3
64e75e21466e Uploaded
pmac
parents:
diff changeset
60 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
61 res = sort(unique(groups))
64e75e21466e Uploaded
pmac
parents:
diff changeset
62 if (length(res) == 1) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
63 tags = c("UNKNOWN")
64e75e21466e Uploaded
pmac
parents:
diff changeset
64 plot_colors = c("black")
64e75e21466e Uploaded
pmac
parents:
diff changeset
65 } else if (length(res) == 3) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
66 tags = c("UNKNOWN", control_legend, cases_legend)
64e75e21466e Uploaded
pmac
parents:
diff changeset
67 plot_colors = c("black", "blue", "red")
64e75e21466e Uploaded
pmac
parents:
diff changeset
68 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
69 if (all(res == c(1, 2))) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
70 tags = c("UNKNOWN", control_legend)
64e75e21466e Uploaded
pmac
parents:
diff changeset
71 plot_colors = c("black", "blue")
64e75e21466e Uploaded
pmac
parents:
diff changeset
72 } else if (all(res == c(1, 3))) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
73 tags = c("UNKNOWN", cases_legend)
64e75e21466e Uploaded
pmac
parents:
diff changeset
74 plot_colors = c("black", "red")
64e75e21466e Uploaded
pmac
parents:
diff changeset
75 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
76 tags = c(control_legend, cases_legend)
64e75e21466e Uploaded
pmac
parents:
diff changeset
77 plot_colors = c("blue", "red")
64e75e21466e Uploaded
pmac
parents:
diff changeset
78 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
79 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
80 plot_info$groups = groups
64e75e21466e Uploaded
pmac
parents:
diff changeset
81 plot_info$tags = tags
64e75e21466e Uploaded
pmac
parents:
diff changeset
82 plot_info$plot_colors = plot_colors
64e75e21466e Uploaded
pmac
parents:
diff changeset
83 plot_info$plot_symbols = rep(1, length(res))
64e75e21466e Uploaded
pmac
parents:
diff changeset
84 plot_info$plot_title = "Control vs Cases Plot"
64e75e21466e Uploaded
pmac
parents:
diff changeset
85 return(plot_info)
64e75e21466e Uploaded
pmac
parents:
diff changeset
86 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
87
64e75e21466e Uploaded
pmac
parents:
diff changeset
88 # outliers plot; colour outliers red, non-outliers green
64e75e21466e Uploaded
pmac
parents:
diff changeset
89 setup_ol_plot = function(pca_data, outliers) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
90 plot_info = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
91 nsamples = dim(pca_data$values)[1]
64e75e21466e Uploaded
pmac
parents:
diff changeset
92 groups = 1:nsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
93 groups[outliers] = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
94 groups[setdiff(1:nsamples, outliers)] = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
95 plot_info$groups = groups
64e75e21466e Uploaded
pmac
parents:
diff changeset
96 plot_info$tags = c("outliers", "good data")
64e75e21466e Uploaded
pmac
parents:
diff changeset
97 plot_info$plot_colors = c("red", "green")
64e75e21466e Uploaded
pmac
parents:
diff changeset
98 plot_info$plot_symbols = c(1, 20)
64e75e21466e Uploaded
pmac
parents:
diff changeset
99 plot_info$plot_title = "Outliers Plot"
64e75e21466e Uploaded
pmac
parents:
diff changeset
100 return(plot_info)
64e75e21466e Uploaded
pmac
parents:
diff changeset
101 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
102
64e75e21466e Uploaded
pmac
parents:
diff changeset
103 # standard deviations plot; colour samples by s.dev
64e75e21466e Uploaded
pmac
parents:
diff changeset
104 setup_sd_plot = function(pca_data) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
105 plot_info = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
106 nsamples = dim(pca_data$values)[1]
64e75e21466e Uploaded
pmac
parents:
diff changeset
107 pc1 = as.numeric(pca_data$values[, 1])
64e75e21466e Uploaded
pmac
parents:
diff changeset
108 pc2 = as.numeric(pca_data$values[, 2])
64e75e21466e Uploaded
pmac
parents:
diff changeset
109 pc1_sds = as.numeric(lapply(pc1, compute_numsds, pc1))
64e75e21466e Uploaded
pmac
parents:
diff changeset
110 pc2_sds = as.numeric(lapply(pc2, compute_numsds, pc2))
64e75e21466e Uploaded
pmac
parents:
diff changeset
111
64e75e21466e Uploaded
pmac
parents:
diff changeset
112 groups = 1:nsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
113 groups[get_sdset2d(pc1_sds, pc2_sds, 1)] = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
114 groups[get_sdset2d(pc1_sds, pc2_sds, 2)] = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
115 groups[get_sdset2d(pc1_sds, pc2_sds, 3)] = 3
64e75e21466e Uploaded
pmac
parents:
diff changeset
116 groups[union(which(pc1_sds > 3), which(pc2_sds > 3))] = 4
64e75e21466e Uploaded
pmac
parents:
diff changeset
117 plot_info$groups = groups
64e75e21466e Uploaded
pmac
parents:
diff changeset
118 plot_info$tags = c("SD = 1", "SD = 2", "SD = 3", "SD > 3")
64e75e21466e Uploaded
pmac
parents:
diff changeset
119 plot_info$plot_colors = rainbow(4)
64e75e21466e Uploaded
pmac
parents:
diff changeset
120 plot_info$plot_symbols = rep(20, 4)
64e75e21466e Uploaded
pmac
parents:
diff changeset
121 plot_info$plot_title = "Standard Deviations Plot"
64e75e21466e Uploaded
pmac
parents:
diff changeset
122 return(plot_info)
64e75e21466e Uploaded
pmac
parents:
diff changeset
123 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
124
64e75e21466e Uploaded
pmac
parents:
diff changeset
125 # Plot samples, with coloured clusters. Rejected clusters use
64e75e21466e Uploaded
pmac
parents:
diff changeset
126 # a cross symbol instead of a filled circle
64e75e21466e Uploaded
pmac
parents:
diff changeset
127 setup_cluster_plot = function(pca_data, clusters, rc=NULL) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
128 plot_info = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
129 groups = clusters
64e75e21466e Uploaded
pmac
parents:
diff changeset
130 ids = sort(unique(groups))
64e75e21466e Uploaded
pmac
parents:
diff changeset
131 n = length(ids)
64e75e21466e Uploaded
pmac
parents:
diff changeset
132 tags = 1:n
64e75e21466e Uploaded
pmac
parents:
diff changeset
133 for (i in 1:n) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
134 tags[i] = sprintf("cluster %s", ids[i])
64e75e21466e Uploaded
pmac
parents:
diff changeset
135 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
136 outliers = which(groups == 0)
64e75e21466e Uploaded
pmac
parents:
diff changeset
137 if (length(outliers) != 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
138 tags[1] = "outliers"
64e75e21466e Uploaded
pmac
parents:
diff changeset
139 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
140 plot_colors = rainbow(n)
64e75e21466e Uploaded
pmac
parents:
diff changeset
141 plot_symbols = rep(20, n)
64e75e21466e Uploaded
pmac
parents:
diff changeset
142 if (length(outliers) != 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
143 plot_symbols[1] = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
144 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
145 # labelling for rejected clusters
64e75e21466e Uploaded
pmac
parents:
diff changeset
146 if(!is.null(rc)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
147 for(i in 1:n) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
148 if((ids[i] != 0) && (ids[i] %in% as.numeric(rc))) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
149 tags[i] = "rej. clust."
64e75e21466e Uploaded
pmac
parents:
diff changeset
150 plot_symbols[i] = 4
64e75e21466e Uploaded
pmac
parents:
diff changeset
151 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
152 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
153 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
154 plot_info$groups = groups
64e75e21466e Uploaded
pmac
parents:
diff changeset
155 plot_info$tags = tags
64e75e21466e Uploaded
pmac
parents:
diff changeset
156 plot_info$plot_colors = plot_colors
64e75e21466e Uploaded
pmac
parents:
diff changeset
157 plot_info$plot_symbols = plot_symbols
64e75e21466e Uploaded
pmac
parents:
diff changeset
158 plot_info$plot_title = "Cluster Plot"
64e75e21466e Uploaded
pmac
parents:
diff changeset
159 return(plot_info)
64e75e21466e Uploaded
pmac
parents:
diff changeset
160 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
161
64e75e21466e Uploaded
pmac
parents:
diff changeset
162 # Plot samples, colouring by ethnicity. Different ethnicities also
64e75e21466e Uploaded
pmac
parents:
diff changeset
163 # have different symbols.
64e75e21466e Uploaded
pmac
parents:
diff changeset
164 setup_ethnicity_plot = function(pca_data, ethnicity_data) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
165 plot_info = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
166 nsamples = dim(pca_data$values)[1]
64e75e21466e Uploaded
pmac
parents:
diff changeset
167 eth = 1:nsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
168
64e75e21466e Uploaded
pmac
parents:
diff changeset
169 for (i in 1:nsamples) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
170 sample_id = pca_data$ids[i]
64e75e21466e Uploaded
pmac
parents:
diff changeset
171 eth[i] = as.character(ethnicity_data[sample_id, "population"])
64e75e21466e Uploaded
pmac
parents:
diff changeset
172 if(is.na(eth[i])) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
173 eth[i] = "UNKNOWN"
64e75e21466e Uploaded
pmac
parents:
diff changeset
174 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
175 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
176 n = length(unique(eth))
64e75e21466e Uploaded
pmac
parents:
diff changeset
177 plot_info$groups = as.numeric(as.factor(eth))
64e75e21466e Uploaded
pmac
parents:
diff changeset
178 plot_info$tags = sort(unique(eth))
64e75e21466e Uploaded
pmac
parents:
diff changeset
179 plot_info$plot_colors = rainbow(n)
64e75e21466e Uploaded
pmac
parents:
diff changeset
180 plot_info$plot_symbols = 1:n
64e75e21466e Uploaded
pmac
parents:
diff changeset
181 plot_info$plot_title = "Ethnicity Plot"
64e75e21466e Uploaded
pmac
parents:
diff changeset
182 return(plot_info)
64e75e21466e Uploaded
pmac
parents:
diff changeset
183 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
184
64e75e21466e Uploaded
pmac
parents:
diff changeset
185 draw_cutoffs = function(input_data, x, y, numsds) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
186 pcx = as.numeric(input_data[x, ])
64e75e21466e Uploaded
pmac
parents:
diff changeset
187 pcy = as.numeric(input_data[y, ])
64e75e21466e Uploaded
pmac
parents:
diff changeset
188
64e75e21466e Uploaded
pmac
parents:
diff changeset
189 vlines = c(median(pcx) - numsds*sd(pcx),
64e75e21466e Uploaded
pmac
parents:
diff changeset
190 median(pcx) + numsds*sd(pcx))
64e75e21466e Uploaded
pmac
parents:
diff changeset
191 hlines = c(median(pcy) - numsds*sd(pcy),
64e75e21466e Uploaded
pmac
parents:
diff changeset
192 median(pcy) + numsds*sd(pcy))
64e75e21466e Uploaded
pmac
parents:
diff changeset
193 abline(v=vlines)
64e75e21466e Uploaded
pmac
parents:
diff changeset
194 abline(h=hlines)
64e75e21466e Uploaded
pmac
parents:
diff changeset
195 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
196
64e75e21466e Uploaded
pmac
parents:
diff changeset
197 # Following helper functions are used in the 'setup_sd_plot' function
64e75e21466e Uploaded
pmac
parents:
diff changeset
198 # given a list of standard deviations, work out which points are n standard deviations away
64e75e21466e Uploaded
pmac
parents:
diff changeset
199 get_sdset2d = function(x1, x2, n) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
200 if (n == 1) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
201 ind = intersect(which(x1 == 1), which(x2 == 1))
64e75e21466e Uploaded
pmac
parents:
diff changeset
202 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
203 lower = get_sdset2d(x1, x2, n - 1)
64e75e21466e Uploaded
pmac
parents:
diff changeset
204 upper = union(which(x1 > n), which(x2 > n))
64e75e21466e Uploaded
pmac
parents:
diff changeset
205 xset = union(lower, upper)
64e75e21466e Uploaded
pmac
parents:
diff changeset
206 bigset = union(which(x1 == n), which(x2 == n))
64e75e21466e Uploaded
pmac
parents:
diff changeset
207 ind = setdiff(bigset, xset)
64e75e21466e Uploaded
pmac
parents:
diff changeset
208 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
209 return(ind)
64e75e21466e Uploaded
pmac
parents:
diff changeset
210 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
211
64e75e21466e Uploaded
pmac
parents:
diff changeset
212 # work out how many standard deviations away from the sample median a single point is
64e75e21466e Uploaded
pmac
parents:
diff changeset
213 # accuracy of this decreases for outliers, as the error in the estimated sd is
64e75e21466e Uploaded
pmac
parents:
diff changeset
214 # multiplied
64e75e21466e Uploaded
pmac
parents:
diff changeset
215 compute_numsds = function(point, x) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
216 x_sd = sd(x)
64e75e21466e Uploaded
pmac
parents:
diff changeset
217 sum = x_sd
64e75e21466e Uploaded
pmac
parents:
diff changeset
218 m = median(x)
64e75e21466e Uploaded
pmac
parents:
diff changeset
219 i = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
220 while(abs(point - m) > sum) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
221 i = i + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
222 sum = sum + x_sd
64e75e21466e Uploaded
pmac
parents:
diff changeset
223 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
224 return(i)
64e75e21466e Uploaded
pmac
parents:
diff changeset
225 }