comparison mosaics/R/mosaicsFit.R @ 10:d78c3c5e8ff8 draft

Uploaded
author dongjun
date Thu, 10 Jan 2013 16:01:28 -0500
parents b6d0c6ceda2c
children
comparison
equal deleted inserted replaced
9:e41be28411b2 10:d78c3c5e8ff8
1
2 # fit MOSAiCS
3
4 setMethod(
5 f="mosaicsFit",
6 signature="BinData",
7 definition=function( object, analysisType="automatic", bgEst="automatic",
8 k=3, meanThres=NA, s=2, d=0.25, truncProb=0.999,
9 parallel=FALSE, nCore=8 )
10 {
11 # Note: users can tune parameters only regarding MOSAiCS model fitting.
12 # Note: tuning adaptive griding parameters is not supported yet.
13
14 # check options: parallel computing (optional)
15
16 if ( parallel == TRUE ) {
17 message( "Use 'parallel' package for parallel computing." )
18 if ( length(find.package('parallel',quiet=TRUE)) == 0 ) {
19 stop( "Please install 'parallel' package!" )
20 }
21 }
22
23 if ( analysisType == "automatic" ) {
24 # if "analysisType" is not specified, "analysisType" is determined by dataset
25
26 if ( length(object@input)==0 ) {
27 # we don't have input: we should have both of M and GC
28
29 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) {
30 #cat( "Info: one-sample analysis.\n" )
31 analysisType <- "OS"
32 } else {
33 stop( "any of control data, mappability, or GC content does not exist. Cannot proceed!\n" )
34 }
35 } else {
36 # we have input: TS if we have both M & GC; IO otherwise.
37
38 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) {
39 #cat( "Info: two-sample analysis (with mappability & GC content).\n" )
40 analysisType <- "TS"
41 } else {
42 #cat( "Info: two-sample analysis (Input only).\n" )
43 analysisType <- "IO"
44 }
45 }
46 } else {
47 # if "analysisType" is specified, check its validity
48
49 # error treatment: Input-only analysis is impossible if input data does not exist
50
51 if ( analysisType=="IO" & length(object@input)==0 )
52 {
53 message( "Info: two-sample analysis (Input only)." )
54 stop( "control data does not exist. Cannot proceed!\n" )
55 }
56
57 # error treatment: If M or GC does not exist, TS analysis is not available
58
59 if ( analysisType=="OS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) )
60 {
61 message( "Info: one-sample analysis." )
62 stop( "mappability or GC content does not exist. Cannot proceed!\n" )
63 }
64
65 # error treatment: If input data does not exist, TS analysis is not available
66
67 if ( analysisType=="TS" & length(object@input)==0 )
68 {
69 message( "Info: two-sample analysis (with mappability & GC content)." )
70 message( "Info: control data does not exist." )
71 message( "Info: one-sample analysis will be implemented instead." )
72 analysisType <- "OS"
73 }
74
75 # error treatment: If M or GC does not exist, TS analysis is not available
76
77 if ( analysisType=="TS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) )
78 {
79 message( "Info: two-sample analysis (with mappability & GC content)." )
80 message( "Info: mappability or GC content data does not exist." )
81 message( "Info: two-sample analysis (Input only) will be implemented instead." )
82 analysisType <- "IO"
83 }
84 }
85
86 # check validity of "bgEst"
87
88 if ( bgEst == "automatic" ) {
89 message( "Info: background estimation method is determined based on data." )
90
91 Y_freq <- table( object@tagCount )
92
93 if ( sum(Y_freq[ as.numeric(names(Y_freq))<=2 ]) / sum(Y_freq) > 0.5 ) {
94 message( "Info: background estimation based on bins with low tag counts." )
95 bgEst <- "matchLow"
96 } else {
97 message( "Info: background estimation based on robust method of moment" )
98 bgEst <- "rMOM"
99 }
100 } else if ( bgEst == "matchLow" ) {
101 message( "Info: background estimation based on bins with low tag counts." )
102 } else if ( bgEst == "rMOM" ) {
103 message( "Info: background estimation based on robust method of moment." )
104 } else {
105 stop( "Incorrect specification for 'bgEst'! 'bgEst' should be one of 'matchLow', 'rMOM', or 'automatic'!" )
106 }
107
108 # default meanThres for each of "OS" & "TS"
109
110 if ( is.na(meanThres) )
111 {
112 switch( analysisType,
113 OS = {
114 meanThres <- 0
115 },
116 TS = {
117 meanThres <- 1
118 },
119 IO = {
120 meanThres <- NA # meanThres is not used for analysisType=="IO"
121 }
122 )
123 }
124
125 # MOSAiCS model fit
126
127 switch( analysisType,
128 OS = {
129 # one-sample analysis
130
131 message( "Info: one-sample analysis." )
132 fit <- .mosaicsFit_OS( object, bgEst=bgEst, k=k, meanThres=meanThres,
133 parallel=parallel, nCore=nCore )
134 },
135 TS = {
136 # two-sample analysis (with M & GC)
137
138 message( "Info: two-sample analysis (with mappability & GC content)." )
139 fit <- .mosaicsFit_TS( object, bgEst=bgEst,
140 k=k, meanThres=meanThres, s=s, d=d,
141 parallel=parallel, nCore=nCore )
142 },
143 IO = {
144 # two-sample analysis (Input only)
145
146 message( "Info: two-sample analysis (Input only)." )
147 fit <- .mosaicsFit_IO( object, bgEst=bgEst,
148 k=k, d=d, truncProb=truncProb,
149 parallel=parallel, nCore=nCore )
150 }
151 )
152
153 message( "Info: done!" )
154
155 return(fit)
156 }
157 )
158
159 # MOSAiCS one-sample analysis
160
161 .mosaicsFit_OS <- function( binData, bgEst, k=3, meanThres=0,
162 parallel=FALSE, nCore=8 )
163 {
164 message( "Info: use adaptive griding." )
165 message( "Info: fitting background model..." )
166 fitParam <- .adapGridMosaicsZ0_OS(
167 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent,
168 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50),
169 parallel=parallel, nCore=nCore )
170 fitZ0 <- .rlmFit_OS( parEst=fitParam, mean_thres=meanThres,
171 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent )
172 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
173
174 rm( fitParam )
175 gc()
176 message( "Info: done!" )
177
178 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
179 message( "Info: fitting one-signal-component model..." )
180 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
181 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
182 message( "Info: fitting two-signal-component model..." )
183 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
184 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
185
186 message( "Info: calculating BIC of fitted models..." )
187 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
188 pNfit=pNfit, k=k, model="1S", type="BIC", npar=9 )
189 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
190 pNfit=pNfit, k=k, model="2S", type="BIC", npar=12 )
191
192 mosaicsEst <- new( "MosaicsFitEst",
193 pi0=fitZ0$pi0, a=fitZ0$a,
194 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
195 b=fitZ1_1S$b, c=fitZ1_1S$c,
196 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
197 analysisType="OS" )
198
199 rm( fitZ0, fitZ1_1S, fitZ1_2S )
200 gc()
201
202 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres )
203
204 new( "MosaicsFit",
205 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
206 chrID=binData@chrID, coord=binData@coord, tagCount=binData@tagCount,
207 mappability=binData@mappability, gcContent=binData@gcContent,
208 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
209 }
210
211 # MOSAiCS two-sample analysis (with M & GC)
212
213 .mosaicsFit_TS <- function( binData, bgEst, k=3, meanThres=1, s=2, d=0.25,
214 parallel=FALSE, nCore=8 )
215 {
216 message( "Info: use adaptive griding." )
217 message( "Info: fitting background model..." )
218
219 # warning if there are insufficient # of bins with 0, 1, 2 counts in control sample
220 # -> input-only analysis is preferred.
221
222 X_freq <- table(binData@input)
223 if( sum(X_freq[ as.numeric(names(X_freq))<=2 ])/sum(X_freq) < 0.5 ) {
224 message( paste("Info: insufficient # of bins with counts <=", s, "in control sample.") )
225 message( "Info: Model fit can be unstable. Input-only analysis might be preferred." )
226 }
227
228 fitParam <- .adapGridMosaicsZ0_TS(
229 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input,
230 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), min_n_X=200,
231 parallel=parallel, nCore=nCore )
232 fitZ0 <- .rlmFit_TS( parEst=fitParam, mean_thres=meanThres, s=s, d=d,
233 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input )
234 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
235
236 rm( fitParam )
237 gc()
238 message( "Info: done!" )
239
240 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
241 message( "Info: fitting one-signal-component model..." )
242 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
243 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
244 message( "Info: fitting two-signal-component model..." )
245 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
246 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
247
248 message( "Info: calculating BIC of fitted models..." )
249 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
250 pNfit=pNfit, k=k, model="1S", type="BIC", npar=11 )
251 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
252 pNfit=pNfit, k=k, model="2S", type="BIC", npar=14 )
253
254 mosaicsEst <- new( "MosaicsFitEst",
255 pi0=fitZ0$pi0, a=fitZ0$a,
256 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
257 b=fitZ1_1S$b, c=fitZ1_1S$c,
258 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
259 analysisType="TS" )
260
261 rm( fitZ0, fitZ1_1S, fitZ1_2S )
262 gc()
263
264 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres, s=s, d=d )
265
266 new( "MosaicsFit",
267 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
268 chrID=binData@chrID, coord=binData@coord,
269 tagCount=binData@tagCount, input=binData@input,
270 mappability=binData@mappability, gcContent=binData@gcContent,
271 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
272 }
273
274 # MOSAiCS two-sample analysis (Input only)
275
276 .mosaicsFit_IO <- function( binData, bgEst, k=3, d=0.25, truncProb=0.999,
277 parallel=FALSE, nCore=8 )
278 {
279 message( "Info: use adaptive griding." )
280 message( "Info: fitting background model..." )
281
282 inputTrunc <- quantile( binData@input, truncProb )
283
284 #fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input,
285 # min_n_X=50 )
286 fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input,
287 bgEst=bgEst, inputTrunc=inputTrunc, min_n_X=50,
288 parallel=parallel, nCore=nCore )
289 fitZ0 <- .rlmFit_IO( parEst=fitParam, d=d, Y=binData@tagCount,
290 X=binData@input, inputTrunc=inputTrunc )
291 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
292
293 rm( fitParam )
294 gc()
295 message( "Info: done!" )
296
297 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
298 message( "Info: fitting one-signal-component model..." )
299 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
300 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
301 message( "Info: fitting two-signal-component model..." )
302 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
303 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
304
305 message( "Info: calculating BIC of fitted models..." )
306 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
307 pNfit=pNfit, k=k, model="1S", type="BIC", npar=6 )
308 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
309 pNfit=pNfit, k=k, model="2S", type="BIC", npar=9 )
310
311 mosaicsEst <- new( "MosaicsFitEst",
312 pi0=fitZ0$pi0, a=fitZ0$a,
313 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
314 b=fitZ1_1S$b, c=fitZ1_1S$c,
315 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
316 inputTrunc=inputTrunc, analysisType="IO" )
317
318 rm( fitZ0, fitZ1_1S, fitZ1_2S )
319 gc()
320
321 mosaicsParam <- new( "MosaicsFitParam", k=k, d=d )
322
323 new( "MosaicsFit",
324 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
325 chrID=binData@chrID, coord=binData@coord,
326 tagCount=binData@tagCount, input=binData@input,
327 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
328 }