annotate Dotplot_Release/Step4_biclustering.R @ 1:7e98f893e9ae draft

Uploaded
author bornea
date Fri, 29 Jan 2016 09:57:31 -0500
parents dfa3436beb67
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
1 #!/usr/bin/env Rscript
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
2
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
3 args <- commandArgs(trailingOnly = TRUE)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
4
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
5 d = read.delim(args[1], header=T, sep="\t", as.is=T, row.names=1)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
6
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
7 clusters = read.delim("Clusters", header=T, sep="\t", as.is=T)[,-1]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
8 clusters = data.frame(Bait=colnames(clusters), Cluster=as.numeric(clusters[1,]))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
9 nested.clusters = read.delim("NestedClusters", header=F, sep="\t", as.is=T)[1:dim(d)[1],]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
10 nested.phi = read.delim("NestedMu", header=F, sep="\t", as.is=T)[1:dim(d)[1],]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
11 nested.phi2 = read.delim("NestedSigma2", header=F, sep="\t", as.is=T)[1:dim(d)[1],]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
12 mcmc = read.delim("MCMCparameters", header=F, sep="\t", as.is=T)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
13
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
14 ### distance between bait using phi (also reorder cluster names)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
15 ### report nested clusters with positive counts only
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
16 ### rearrange rows and columns of the raw data matrix according to the back-tracking algorithm
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
17
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
18 recursivePaste = function(x) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
19 n = length(x)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
20 x = x[order(x)]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
21 y = x[1]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
22 if(n > 1) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
23 for(i in 2:n) y = paste(y, x[i], sep="-")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
24 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
25 y
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
26 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
27
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
28 calcDist = function(x, y) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
29 if(length(x) != length(y)) stop("different length\n")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
30 else res = sum(abs(x-y))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
31 res
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
32 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
33
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
34
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
35 #clusters, nested.clusters, nested.phi, d
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
36
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
37 bcl = clusters
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
38 pcl = nested.clusters
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
39 phi = nested.phi
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
40 phi2 = nested.phi2
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
41 dat = d
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
42
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
43
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
44 ## bipartite graph
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
45 make.graphlet = function(b,p,s) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
46 g = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
47 g$b = b
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
48 g$p = p
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
49 g$s = as.numeric(s)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
50 g
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
51 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
52
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
53 make.hub = function(b,p) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
54 g = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
55 g$b = b
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
56 g$p = p
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
57 g
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
58 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
59
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
60 jaccard = function(x,y) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
61 j = length(intersect(x,y)) / length(union(x,y))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
62 j
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
63 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
64
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
65 merge.graphlets = function(x, y) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
66 g = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
67 g$b = union(x$b, y$b)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
68 g$p = union(x$p, y$p)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
69 g$s1 = rep(0,length(g$p))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
70 g$s2 = rep(0,length(g$p))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
71 g$s1[match(x$p, g$p)] = x$s
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
72 g$s2[match(y$p, g$p)] = y$s
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
73 g$s = apply(cbind(g$s1, g$s2), 1, max)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
74 g
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
75 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
76
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
77 summarizeDP = function(bcl, pcl, phi, phi2, dat, hub.size=0.5, ...) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
78 pcl = as.matrix(pcl)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
79 phi = as.matrix(phi)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
80 phi2 = as.matrix(phi2)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
81 dat = as.matrix(dat)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
82 rownames(phi) = rownames(dat)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
83 rownames(phi2) = rownames(dat)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
84
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
85 ubcl = unique(as.numeric(bcl$Cluster))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
86 n = length(ubcl)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
87 pcl = pcl[,ubcl]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
88 phi = phi[,ubcl]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
89 phi2 = phi2[,ubcl]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
90 phi[phi < 0.05] = 0
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
91
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
92 bcl$Cluster = match(as.numeric(bcl$Cluster), ubcl)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
93 colnames(pcl) = colnames(phi) = colnames(phi2) = paste("CL", 1:n, sep="")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
94
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
95 ## remove non-reproducible mean values
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
96 nprey = dim(dat)[1]; nbait = dim(dat)[2]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
97 preys = rownames(dat); baits = colnames(dat)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
98 n = length(unique(bcl$Cluster))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
99 for(j in 1:n) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
100 id = c(1:nbait)[bcl$Cluster == j]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
101 for(k in 1:nprey) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
102 do.it = ifelse(mean(as.numeric(dat[k,id]) > 0) <= 0.5,TRUE,FALSE)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
103 if(do.it) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
104 phi[k,j] = 0
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
105 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
106 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
107 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
108
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
109 ## create bipartite graphs (graphlets)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
110 gr = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
111 for(j in 1:n) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
112 id = c(1:nbait)[bcl$Cluster == j]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
113 id2 = c(1:nprey)[phi[,j] > 0]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
114 gr[[j]] = make.graphlet(baits[id], preys[id2], phi[id2,j])
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
115 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
116
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
117 ## intersecting preys between graphlets
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
118 gr2 = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
119 cur = 1
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
120 for(i in 1:n) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
121 for(j in 1:n) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
122 if(i != j) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
123 combine = jaccard(gr[[i]]$p, gr[[j]]$p) >= 0.75
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
124 if(combine) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
125 gr2[[cur]] = merge.graphlets(gr[[i]], gr[[j]])
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
126 cur = cur + 1
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
127 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
128 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
129 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
130 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
131
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
132 old.phi = phi
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
133 phi = phi[, bcl$Cluster]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
134 phi2 = phi2[, bcl$Cluster]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
135 ## find hub preys
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
136 proceed = apply(old.phi, 1, function(x) sum(x>0) >= 2)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
137 h = NULL
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
138 cur = 1
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
139 for(k in 1:nprey) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
140 if(proceed[k]) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
141 id = as.numeric(phi[k,]) > 0
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
142 if(mean(id) >= hub.size) {
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
143 h[[cur]] = make.hub(baits[id], preys[k])
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
144 cur = cur + 1
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
145 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
146 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
147 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
148 nhub = cur - 1
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
149
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
150 res = list(data=dat, baitCL=bcl, phi=phi, phi2=phi2, gr = gr, gr2 = gr2, hub = h)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
151 res
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
152 }
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
153
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
154 res = summarizeDP(clusters, nested.clusters, nested.phi, nested.phi2, d)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
155
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
156 write.table(res$baitCL[order(res$baitCL$Cluster),], "baitClusters", sep="\t", quote=F, row.names=F)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
157 write.table(res$data, "clusteredData", sep="\t", quote=F)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
158
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
159 ##### SOFT
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
160 library(gplots)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
161 tmpd = res$data
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
162 tmpm = res$phi
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
163 colnames(tmpm) = paste(colnames(res$data), colnames(tmpm))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
164
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
165 pdf("estimated.pdf", height=25, width=8)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
166 my.hclust<-hclust(dist(tmpd))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
167 my.dend<-as.dendrogram(my.hclust)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
168 tmp.res = heatmap.2(tmpm, Rowv=my.dend, Colv=T, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
169 #tmp.res = heatmap.2(tmpm, Rowv=T, Colv=T, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
170 tmpd = tmpd[rev(tmp.res$rowInd),tmp.res$colInd]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
171 write.table(tmpd, "clustered_matrix.txt", sep="\t", quote=F)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
172 heatmap.2(tmpd, Rowv=F, Colv=F, trace="n", col=rev(heat.colors(10)), breaks=seq(0,.5,by=0.05), margins=c(10,10), keysize=0.8, cexRow=0.4)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
173 dev.off()
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
174
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
175
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
176 ### Statistical Plots
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
177 dd = dist(1-cor((res$phi), method="pearson"))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
178 dend = as.dendrogram(hclust(dd, "ave"))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
179 #plot(dend)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
180
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
181 pdf("bait2bait.pdf")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
182 tmp = res$phi
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
183 colnames(tmp) = paste(colnames(res$phi), res$baitCL$Bait, sep="_")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
184
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
185 ###dd = cor(tmp[,-26]) ### This line is only for Chris' data (one bait has all zeros in the estimated parameters)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
186 dd = cor(tmp) ### This line is only for Chris' data (one bait has all zeros in the estimated parameters)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
187
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
188 write.table(dd, "bait2bait_matrix.txt", sep="\t", quote=F)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
189 heatmap.2(as.matrix(dd), trace="n", breaks=seq(-1,1,by=0.1), col=(greenred(20)), cexRow=0.7, cexCol=0.7)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
190 dev.off()
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
191
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
192 tmp = mcmc[,2]
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
193 ymax = max(tmp)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
194 ymin = min(tmp)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
195 pdf("stats.pdf", height=12, width=12)
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
196
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
197 plot(mcmc[mcmc[,4]=="G",3], type="s", xlab="Iterations", ylab="Number of Clusters", main="")
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
198 plot(mcmc[,2], type="l", xlab="Iterations", ylab="Log-Likelihood", main="", ylim=c(ymin,ymax))
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
199
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
200 dev.off()
dfa3436beb67 Uploaded
bornea
parents:
diff changeset
201