Mercurial > repos > dongjun > mosaics
comparison mosaics/R/mosaicsFit.R @ 10:d78c3c5e8ff8 draft
Uploaded
author | dongjun |
---|---|
date | Thu, 10 Jan 2013 16:01:28 -0500 |
parents | b6d0c6ceda2c |
children |
comparison
equal
deleted
inserted
replaced
9:e41be28411b2 | 10:d78c3c5e8ff8 |
---|---|
1 | |
2 # fit MOSAiCS | |
3 | |
4 setMethod( | |
5 f="mosaicsFit", | |
6 signature="BinData", | |
7 definition=function( object, analysisType="automatic", bgEst="automatic", | |
8 k=3, meanThres=NA, s=2, d=0.25, truncProb=0.999, | |
9 parallel=FALSE, nCore=8 ) | |
10 { | |
11 # Note: users can tune parameters only regarding MOSAiCS model fitting. | |
12 # Note: tuning adaptive griding parameters is not supported yet. | |
13 | |
14 # check options: parallel computing (optional) | |
15 | |
16 if ( parallel == TRUE ) { | |
17 message( "Use 'parallel' package for parallel computing." ) | |
18 if ( length(find.package('parallel',quiet=TRUE)) == 0 ) { | |
19 stop( "Please install 'parallel' package!" ) | |
20 } | |
21 } | |
22 | |
23 if ( analysisType == "automatic" ) { | |
24 # if "analysisType" is not specified, "analysisType" is determined by dataset | |
25 | |
26 if ( length(object@input)==0 ) { | |
27 # we don't have input: we should have both of M and GC | |
28 | |
29 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) { | |
30 #cat( "Info: one-sample analysis.\n" ) | |
31 analysisType <- "OS" | |
32 } else { | |
33 stop( "any of control data, mappability, or GC content does not exist. Cannot proceed!\n" ) | |
34 } | |
35 } else { | |
36 # we have input: TS if we have both M & GC; IO otherwise. | |
37 | |
38 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) { | |
39 #cat( "Info: two-sample analysis (with mappability & GC content).\n" ) | |
40 analysisType <- "TS" | |
41 } else { | |
42 #cat( "Info: two-sample analysis (Input only).\n" ) | |
43 analysisType <- "IO" | |
44 } | |
45 } | |
46 } else { | |
47 # if "analysisType" is specified, check its validity | |
48 | |
49 # error treatment: Input-only analysis is impossible if input data does not exist | |
50 | |
51 if ( analysisType=="IO" & length(object@input)==0 ) | |
52 { | |
53 message( "Info: two-sample analysis (Input only)." ) | |
54 stop( "control data does not exist. Cannot proceed!\n" ) | |
55 } | |
56 | |
57 # error treatment: If M or GC does not exist, TS analysis is not available | |
58 | |
59 if ( analysisType=="OS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) ) | |
60 { | |
61 message( "Info: one-sample analysis." ) | |
62 stop( "mappability or GC content does not exist. Cannot proceed!\n" ) | |
63 } | |
64 | |
65 # error treatment: If input data does not exist, TS analysis is not available | |
66 | |
67 if ( analysisType=="TS" & length(object@input)==0 ) | |
68 { | |
69 message( "Info: two-sample analysis (with mappability & GC content)." ) | |
70 message( "Info: control data does not exist." ) | |
71 message( "Info: one-sample analysis will be implemented instead." ) | |
72 analysisType <- "OS" | |
73 } | |
74 | |
75 # error treatment: If M or GC does not exist, TS analysis is not available | |
76 | |
77 if ( analysisType=="TS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) ) | |
78 { | |
79 message( "Info: two-sample analysis (with mappability & GC content)." ) | |
80 message( "Info: mappability or GC content data does not exist." ) | |
81 message( "Info: two-sample analysis (Input only) will be implemented instead." ) | |
82 analysisType <- "IO" | |
83 } | |
84 } | |
85 | |
86 # check validity of "bgEst" | |
87 | |
88 if ( bgEst == "automatic" ) { | |
89 message( "Info: background estimation method is determined based on data." ) | |
90 | |
91 Y_freq <- table( object@tagCount ) | |
92 | |
93 if ( sum(Y_freq[ as.numeric(names(Y_freq))<=2 ]) / sum(Y_freq) > 0.5 ) { | |
94 message( "Info: background estimation based on bins with low tag counts." ) | |
95 bgEst <- "matchLow" | |
96 } else { | |
97 message( "Info: background estimation based on robust method of moment" ) | |
98 bgEst <- "rMOM" | |
99 } | |
100 } else if ( bgEst == "matchLow" ) { | |
101 message( "Info: background estimation based on bins with low tag counts." ) | |
102 } else if ( bgEst == "rMOM" ) { | |
103 message( "Info: background estimation based on robust method of moment." ) | |
104 } else { | |
105 stop( "Incorrect specification for 'bgEst'! 'bgEst' should be one of 'matchLow', 'rMOM', or 'automatic'!" ) | |
106 } | |
107 | |
108 # default meanThres for each of "OS" & "TS" | |
109 | |
110 if ( is.na(meanThres) ) | |
111 { | |
112 switch( analysisType, | |
113 OS = { | |
114 meanThres <- 0 | |
115 }, | |
116 TS = { | |
117 meanThres <- 1 | |
118 }, | |
119 IO = { | |
120 meanThres <- NA # meanThres is not used for analysisType=="IO" | |
121 } | |
122 ) | |
123 } | |
124 | |
125 # MOSAiCS model fit | |
126 | |
127 switch( analysisType, | |
128 OS = { | |
129 # one-sample analysis | |
130 | |
131 message( "Info: one-sample analysis." ) | |
132 fit <- .mosaicsFit_OS( object, bgEst=bgEst, k=k, meanThres=meanThres, | |
133 parallel=parallel, nCore=nCore ) | |
134 }, | |
135 TS = { | |
136 # two-sample analysis (with M & GC) | |
137 | |
138 message( "Info: two-sample analysis (with mappability & GC content)." ) | |
139 fit <- .mosaicsFit_TS( object, bgEst=bgEst, | |
140 k=k, meanThres=meanThres, s=s, d=d, | |
141 parallel=parallel, nCore=nCore ) | |
142 }, | |
143 IO = { | |
144 # two-sample analysis (Input only) | |
145 | |
146 message( "Info: two-sample analysis (Input only)." ) | |
147 fit <- .mosaicsFit_IO( object, bgEst=bgEst, | |
148 k=k, d=d, truncProb=truncProb, | |
149 parallel=parallel, nCore=nCore ) | |
150 } | |
151 ) | |
152 | |
153 message( "Info: done!" ) | |
154 | |
155 return(fit) | |
156 } | |
157 ) | |
158 | |
159 # MOSAiCS one-sample analysis | |
160 | |
161 .mosaicsFit_OS <- function( binData, bgEst, k=3, meanThres=0, | |
162 parallel=FALSE, nCore=8 ) | |
163 { | |
164 message( "Info: use adaptive griding." ) | |
165 message( "Info: fitting background model..." ) | |
166 fitParam <- .adapGridMosaicsZ0_OS( | |
167 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, | |
168 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), | |
169 parallel=parallel, nCore=nCore ) | |
170 fitZ0 <- .rlmFit_OS( parEst=fitParam, mean_thres=meanThres, | |
171 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent ) | |
172 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst ) | |
173 | |
174 rm( fitParam ) | |
175 gc() | |
176 message( "Info: done!" ) | |
177 | |
178 Y_bd_all <- .calcYbdAll( fitZ0, k=k ) | |
179 message( "Info: fitting one-signal-component model..." ) | |
180 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount, | |
181 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
182 message( "Info: fitting two-signal-component model..." ) | |
183 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount, | |
184 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
185 | |
186 message( "Info: calculating BIC of fitted models..." ) | |
187 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount, | |
188 pNfit=pNfit, k=k, model="1S", type="BIC", npar=9 ) | |
189 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount, | |
190 pNfit=pNfit, k=k, model="2S", type="BIC", npar=12 ) | |
191 | |
192 mosaicsEst <- new( "MosaicsFitEst", | |
193 pi0=fitZ0$pi0, a=fitZ0$a, | |
194 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit, | |
195 b=fitZ1_1S$b, c=fitZ1_1S$c, | |
196 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2, | |
197 analysisType="OS" ) | |
198 | |
199 rm( fitZ0, fitZ1_1S, fitZ1_2S ) | |
200 gc() | |
201 | |
202 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres ) | |
203 | |
204 new( "MosaicsFit", | |
205 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam, | |
206 chrID=binData@chrID, coord=binData@coord, tagCount=binData@tagCount, | |
207 mappability=binData@mappability, gcContent=binData@gcContent, | |
208 bic1S=fitBIC_1S, bic2S=fitBIC_2S ) | |
209 } | |
210 | |
211 # MOSAiCS two-sample analysis (with M & GC) | |
212 | |
213 .mosaicsFit_TS <- function( binData, bgEst, k=3, meanThres=1, s=2, d=0.25, | |
214 parallel=FALSE, nCore=8 ) | |
215 { | |
216 message( "Info: use adaptive griding." ) | |
217 message( "Info: fitting background model..." ) | |
218 | |
219 # warning if there are insufficient # of bins with 0, 1, 2 counts in control sample | |
220 # -> input-only analysis is preferred. | |
221 | |
222 X_freq <- table(binData@input) | |
223 if( sum(X_freq[ as.numeric(names(X_freq))<=2 ])/sum(X_freq) < 0.5 ) { | |
224 message( paste("Info: insufficient # of bins with counts <=", s, "in control sample.") ) | |
225 message( "Info: Model fit can be unstable. Input-only analysis might be preferred." ) | |
226 } | |
227 | |
228 fitParam <- .adapGridMosaicsZ0_TS( | |
229 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input, | |
230 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), min_n_X=200, | |
231 parallel=parallel, nCore=nCore ) | |
232 fitZ0 <- .rlmFit_TS( parEst=fitParam, mean_thres=meanThres, s=s, d=d, | |
233 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input ) | |
234 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst ) | |
235 | |
236 rm( fitParam ) | |
237 gc() | |
238 message( "Info: done!" ) | |
239 | |
240 Y_bd_all <- .calcYbdAll( fitZ0, k=k ) | |
241 message( "Info: fitting one-signal-component model..." ) | |
242 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount, | |
243 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
244 message( "Info: fitting two-signal-component model..." ) | |
245 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount, | |
246 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
247 | |
248 message( "Info: calculating BIC of fitted models..." ) | |
249 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount, | |
250 pNfit=pNfit, k=k, model="1S", type="BIC", npar=11 ) | |
251 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount, | |
252 pNfit=pNfit, k=k, model="2S", type="BIC", npar=14 ) | |
253 | |
254 mosaicsEst <- new( "MosaicsFitEst", | |
255 pi0=fitZ0$pi0, a=fitZ0$a, | |
256 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit, | |
257 b=fitZ1_1S$b, c=fitZ1_1S$c, | |
258 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2, | |
259 analysisType="TS" ) | |
260 | |
261 rm( fitZ0, fitZ1_1S, fitZ1_2S ) | |
262 gc() | |
263 | |
264 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres, s=s, d=d ) | |
265 | |
266 new( "MosaicsFit", | |
267 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam, | |
268 chrID=binData@chrID, coord=binData@coord, | |
269 tagCount=binData@tagCount, input=binData@input, | |
270 mappability=binData@mappability, gcContent=binData@gcContent, | |
271 bic1S=fitBIC_1S, bic2S=fitBIC_2S ) | |
272 } | |
273 | |
274 # MOSAiCS two-sample analysis (Input only) | |
275 | |
276 .mosaicsFit_IO <- function( binData, bgEst, k=3, d=0.25, truncProb=0.999, | |
277 parallel=FALSE, nCore=8 ) | |
278 { | |
279 message( "Info: use adaptive griding." ) | |
280 message( "Info: fitting background model..." ) | |
281 | |
282 inputTrunc <- quantile( binData@input, truncProb ) | |
283 | |
284 #fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input, | |
285 # min_n_X=50 ) | |
286 fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input, | |
287 bgEst=bgEst, inputTrunc=inputTrunc, min_n_X=50, | |
288 parallel=parallel, nCore=nCore ) | |
289 fitZ0 <- .rlmFit_IO( parEst=fitParam, d=d, Y=binData@tagCount, | |
290 X=binData@input, inputTrunc=inputTrunc ) | |
291 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst ) | |
292 | |
293 rm( fitParam ) | |
294 gc() | |
295 message( "Info: done!" ) | |
296 | |
297 Y_bd_all <- .calcYbdAll( fitZ0, k=k ) | |
298 message( "Info: fitting one-signal-component model..." ) | |
299 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount, | |
300 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
301 message( "Info: fitting two-signal-component model..." ) | |
302 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount, | |
303 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k ) | |
304 | |
305 message( "Info: calculating BIC of fitted models..." ) | |
306 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount, | |
307 pNfit=pNfit, k=k, model="1S", type="BIC", npar=6 ) | |
308 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount, | |
309 pNfit=pNfit, k=k, model="2S", type="BIC", npar=9 ) | |
310 | |
311 mosaicsEst <- new( "MosaicsFitEst", | |
312 pi0=fitZ0$pi0, a=fitZ0$a, | |
313 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit, | |
314 b=fitZ1_1S$b, c=fitZ1_1S$c, | |
315 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2, | |
316 inputTrunc=inputTrunc, analysisType="IO" ) | |
317 | |
318 rm( fitZ0, fitZ1_1S, fitZ1_2S ) | |
319 gc() | |
320 | |
321 mosaicsParam <- new( "MosaicsFitParam", k=k, d=d ) | |
322 | |
323 new( "MosaicsFit", | |
324 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam, | |
325 chrID=binData@chrID, coord=binData@coord, | |
326 tagCount=binData@tagCount, input=binData@input, | |
327 bic1S=fitBIC_1S, bic2S=fitBIC_2S ) | |
328 } |