2
|
1
|
|
2 # fit MOSAiCS
|
|
3
|
|
4 setMethod(
|
|
5 f="mosaicsFit",
|
|
6 signature="BinData",
|
|
7 definition=function( object, analysisType="automatic", bgEst="automatic",
|
|
8 k=3, meanThres=NA, s=2, d=0.25, truncProb=0.999,
|
|
9 parallel=FALSE, nCore=8 )
|
|
10 {
|
|
11 # Note: users can tune parameters only regarding MOSAiCS model fitting.
|
|
12 # Note: tuning adaptive griding parameters is not supported yet.
|
|
13
|
|
14 # check options: parallel computing (optional)
|
|
15
|
|
16 if ( parallel == TRUE ) {
|
|
17 message( "Use 'parallel' package for parallel computing." )
|
|
18 if ( length(find.package('parallel',quiet=TRUE)) == 0 ) {
|
|
19 stop( "Please install 'parallel' package!" )
|
|
20 }
|
|
21 }
|
|
22
|
|
23 if ( analysisType == "automatic" ) {
|
|
24 # if "analysisType" is not specified, "analysisType" is determined by dataset
|
|
25
|
|
26 if ( length(object@input)==0 ) {
|
|
27 # we don't have input: we should have both of M and GC
|
|
28
|
|
29 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) {
|
|
30 #cat( "Info: one-sample analysis.\n" )
|
|
31 analysisType <- "OS"
|
|
32 } else {
|
|
33 stop( "any of control data, mappability, or GC content does not exist. Cannot proceed!\n" )
|
|
34 }
|
|
35 } else {
|
|
36 # we have input: TS if we have both M & GC; IO otherwise.
|
|
37
|
|
38 if ( length(object@mappability)>0 & length(object@gcContent)>0 ) {
|
|
39 #cat( "Info: two-sample analysis (with mappability & GC content).\n" )
|
|
40 analysisType <- "TS"
|
|
41 } else {
|
|
42 #cat( "Info: two-sample analysis (Input only).\n" )
|
|
43 analysisType <- "IO"
|
|
44 }
|
|
45 }
|
|
46 } else {
|
|
47 # if "analysisType" is specified, check its validity
|
|
48
|
|
49 # error treatment: Input-only analysis is impossible if input data does not exist
|
|
50
|
|
51 if ( analysisType=="IO" & length(object@input)==0 )
|
|
52 {
|
|
53 message( "Info: two-sample analysis (Input only)." )
|
|
54 stop( "control data does not exist. Cannot proceed!\n" )
|
|
55 }
|
|
56
|
|
57 # error treatment: If M or GC does not exist, TS analysis is not available
|
|
58
|
|
59 if ( analysisType=="OS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) )
|
|
60 {
|
|
61 message( "Info: one-sample analysis." )
|
|
62 stop( "mappability or GC content does not exist. Cannot proceed!\n" )
|
|
63 }
|
|
64
|
|
65 # error treatment: If input data does not exist, TS analysis is not available
|
|
66
|
|
67 if ( analysisType=="TS" & length(object@input)==0 )
|
|
68 {
|
|
69 message( "Info: two-sample analysis (with mappability & GC content)." )
|
|
70 message( "Info: control data does not exist." )
|
|
71 message( "Info: one-sample analysis will be implemented instead." )
|
|
72 analysisType <- "OS"
|
|
73 }
|
|
74
|
|
75 # error treatment: If M or GC does not exist, TS analysis is not available
|
|
76
|
|
77 if ( analysisType=="TS" & ( length(object@mappability)==0 | length(object@gcContent)==0 ) )
|
|
78 {
|
|
79 message( "Info: two-sample analysis (with mappability & GC content)." )
|
|
80 message( "Info: mappability or GC content data does not exist." )
|
|
81 message( "Info: two-sample analysis (Input only) will be implemented instead." )
|
|
82 analysisType <- "IO"
|
|
83 }
|
|
84 }
|
|
85
|
|
86 # check validity of "bgEst"
|
|
87
|
|
88 if ( bgEst == "automatic" ) {
|
|
89 message( "Info: background estimation method is determined based on data." )
|
|
90
|
|
91 Y_freq <- table( object@tagCount )
|
|
92
|
|
93 if ( sum(Y_freq[ as.numeric(names(Y_freq))<=2 ]) / sum(Y_freq) > 0.5 ) {
|
|
94 message( "Info: background estimation based on bins with low tag counts." )
|
|
95 bgEst <- "matchLow"
|
|
96 } else {
|
|
97 message( "Info: background estimation based on robust method of moment" )
|
|
98 bgEst <- "rMOM"
|
|
99 }
|
|
100 } else if ( bgEst == "matchLow" ) {
|
|
101 message( "Info: background estimation based on bins with low tag counts." )
|
|
102 } else if ( bgEst == "rMOM" ) {
|
|
103 message( "Info: background estimation based on robust method of moment." )
|
|
104 } else {
|
|
105 stop( "Incorrect specification for 'bgEst'! 'bgEst' should be one of 'matchLow', 'rMOM', or 'automatic'!" )
|
|
106 }
|
|
107
|
|
108 # default meanThres for each of "OS" & "TS"
|
|
109
|
|
110 if ( is.na(meanThres) )
|
|
111 {
|
|
112 switch( analysisType,
|
|
113 OS = {
|
|
114 meanThres <- 0
|
|
115 },
|
|
116 TS = {
|
|
117 meanThres <- 1
|
|
118 },
|
|
119 IO = {
|
|
120 meanThres <- NA # meanThres is not used for analysisType=="IO"
|
|
121 }
|
|
122 )
|
|
123 }
|
|
124
|
|
125 # MOSAiCS model fit
|
|
126
|
|
127 switch( analysisType,
|
|
128 OS = {
|
|
129 # one-sample analysis
|
|
130
|
|
131 message( "Info: one-sample analysis." )
|
|
132 fit <- .mosaicsFit_OS( object, bgEst=bgEst, k=k, meanThres=meanThres,
|
|
133 parallel=parallel, nCore=nCore )
|
|
134 },
|
|
135 TS = {
|
|
136 # two-sample analysis (with M & GC)
|
|
137
|
|
138 message( "Info: two-sample analysis (with mappability & GC content)." )
|
|
139 fit <- .mosaicsFit_TS( object, bgEst=bgEst,
|
|
140 k=k, meanThres=meanThres, s=s, d=d,
|
|
141 parallel=parallel, nCore=nCore )
|
|
142 },
|
|
143 IO = {
|
|
144 # two-sample analysis (Input only)
|
|
145
|
|
146 message( "Info: two-sample analysis (Input only)." )
|
|
147 fit <- .mosaicsFit_IO( object, bgEst=bgEst,
|
|
148 k=k, d=d, truncProb=truncProb,
|
|
149 parallel=parallel, nCore=nCore )
|
|
150 }
|
|
151 )
|
|
152
|
|
153 message( "Info: done!" )
|
|
154
|
|
155 return(fit)
|
|
156 }
|
|
157 )
|
|
158
|
|
159 # MOSAiCS one-sample analysis
|
|
160
|
|
161 .mosaicsFit_OS <- function( binData, bgEst, k=3, meanThres=0,
|
|
162 parallel=FALSE, nCore=8 )
|
|
163 {
|
|
164 message( "Info: use adaptive griding." )
|
|
165 message( "Info: fitting background model..." )
|
|
166 fitParam <- .adapGridMosaicsZ0_OS(
|
|
167 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent,
|
|
168 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50),
|
|
169 parallel=parallel, nCore=nCore )
|
|
170 fitZ0 <- .rlmFit_OS( parEst=fitParam, mean_thres=meanThres,
|
|
171 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent )
|
|
172 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
|
|
173
|
|
174 rm( fitParam )
|
|
175 gc()
|
|
176 message( "Info: done!" )
|
|
177
|
|
178 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
|
|
179 message( "Info: fitting one-signal-component model..." )
|
|
180 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
|
|
181 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
182 message( "Info: fitting two-signal-component model..." )
|
|
183 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
|
|
184 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
185
|
|
186 message( "Info: calculating BIC of fitted models..." )
|
|
187 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
|
|
188 pNfit=pNfit, k=k, model="1S", type="BIC", npar=9 )
|
|
189 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
|
|
190 pNfit=pNfit, k=k, model="2S", type="BIC", npar=12 )
|
|
191
|
|
192 mosaicsEst <- new( "MosaicsFitEst",
|
|
193 pi0=fitZ0$pi0, a=fitZ0$a,
|
|
194 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
|
|
195 b=fitZ1_1S$b, c=fitZ1_1S$c,
|
|
196 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
|
|
197 analysisType="OS" )
|
|
198
|
|
199 rm( fitZ0, fitZ1_1S, fitZ1_2S )
|
|
200 gc()
|
|
201
|
|
202 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres )
|
|
203
|
|
204 new( "MosaicsFit",
|
|
205 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
|
|
206 chrID=binData@chrID, coord=binData@coord, tagCount=binData@tagCount,
|
|
207 mappability=binData@mappability, gcContent=binData@gcContent,
|
|
208 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
|
|
209 }
|
|
210
|
|
211 # MOSAiCS two-sample analysis (with M & GC)
|
|
212
|
|
213 .mosaicsFit_TS <- function( binData, bgEst, k=3, meanThres=1, s=2, d=0.25,
|
|
214 parallel=FALSE, nCore=8 )
|
|
215 {
|
|
216 message( "Info: use adaptive griding." )
|
|
217 message( "Info: fitting background model..." )
|
|
218
|
|
219 # warning if there are insufficient # of bins with 0, 1, 2 counts in control sample
|
|
220 # -> input-only analysis is preferred.
|
|
221
|
|
222 X_freq <- table(binData@input)
|
|
223 if( sum(X_freq[ as.numeric(names(X_freq))<=2 ])/sum(X_freq) < 0.5 ) {
|
|
224 message( paste("Info: insufficient # of bins with counts <=", s, "in control sample.") )
|
|
225 message( "Info: Model fit can be unstable. Input-only analysis might be preferred." )
|
|
226 }
|
|
227
|
|
228 fitParam <- .adapGridMosaicsZ0_TS(
|
|
229 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input,
|
|
230 bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), min_n_X=200,
|
|
231 parallel=parallel, nCore=nCore )
|
|
232 fitZ0 <- .rlmFit_TS( parEst=fitParam, mean_thres=meanThres, s=s, d=d,
|
|
233 Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input )
|
|
234 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
|
|
235
|
|
236 rm( fitParam )
|
|
237 gc()
|
|
238 message( "Info: done!" )
|
|
239
|
|
240 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
|
|
241 message( "Info: fitting one-signal-component model..." )
|
|
242 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
|
|
243 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
244 message( "Info: fitting two-signal-component model..." )
|
|
245 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
|
|
246 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
247
|
|
248 message( "Info: calculating BIC of fitted models..." )
|
|
249 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
|
|
250 pNfit=pNfit, k=k, model="1S", type="BIC", npar=11 )
|
|
251 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
|
|
252 pNfit=pNfit, k=k, model="2S", type="BIC", npar=14 )
|
|
253
|
|
254 mosaicsEst <- new( "MosaicsFitEst",
|
|
255 pi0=fitZ0$pi0, a=fitZ0$a,
|
|
256 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
|
|
257 b=fitZ1_1S$b, c=fitZ1_1S$c,
|
|
258 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
|
|
259 analysisType="TS" )
|
|
260
|
|
261 rm( fitZ0, fitZ1_1S, fitZ1_2S )
|
|
262 gc()
|
|
263
|
|
264 mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres, s=s, d=d )
|
|
265
|
|
266 new( "MosaicsFit",
|
|
267 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
|
|
268 chrID=binData@chrID, coord=binData@coord,
|
|
269 tagCount=binData@tagCount, input=binData@input,
|
|
270 mappability=binData@mappability, gcContent=binData@gcContent,
|
|
271 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
|
|
272 }
|
|
273
|
|
274 # MOSAiCS two-sample analysis (Input only)
|
|
275
|
|
276 .mosaicsFit_IO <- function( binData, bgEst, k=3, d=0.25, truncProb=0.999,
|
|
277 parallel=FALSE, nCore=8 )
|
|
278 {
|
|
279 message( "Info: use adaptive griding." )
|
|
280 message( "Info: fitting background model..." )
|
|
281
|
|
282 inputTrunc <- quantile( binData@input, truncProb )
|
|
283
|
|
284 #fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input,
|
|
285 # min_n_X=50 )
|
|
286 fitParam <- .adapGridMosaicsZ0_IO( Y=binData@tagCount, X=binData@input,
|
|
287 bgEst=bgEst, inputTrunc=inputTrunc, min_n_X=50,
|
|
288 parallel=parallel, nCore=nCore )
|
|
289 fitZ0 <- .rlmFit_IO( parEst=fitParam, d=d, Y=binData@tagCount,
|
|
290 X=binData@input, inputTrunc=inputTrunc )
|
|
291 pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
|
|
292
|
|
293 rm( fitParam )
|
|
294 gc()
|
|
295 message( "Info: done!" )
|
|
296
|
|
297 Y_bd_all <- .calcYbdAll( fitZ0, k=k )
|
|
298 message( "Info: fitting one-signal-component model..." )
|
|
299 fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
|
|
300 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
301 message( "Info: fitting two-signal-component model..." )
|
|
302 fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
|
|
303 pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
|
|
304
|
|
305 message( "Info: calculating BIC of fitted models..." )
|
|
306 fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
|
|
307 pNfit=pNfit, k=k, model="1S", type="BIC", npar=6 )
|
|
308 fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
|
|
309 pNfit=pNfit, k=k, model="2S", type="BIC", npar=9 )
|
|
310
|
|
311 mosaicsEst <- new( "MosaicsFitEst",
|
|
312 pi0=fitZ0$pi0, a=fitZ0$a,
|
|
313 betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
|
|
314 b=fitZ1_1S$b, c=fitZ1_1S$c,
|
|
315 p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
|
|
316 inputTrunc=inputTrunc, analysisType="IO" )
|
|
317
|
|
318 rm( fitZ0, fitZ1_1S, fitZ1_2S )
|
|
319 gc()
|
|
320
|
|
321 mosaicsParam <- new( "MosaicsFitParam", k=k, d=d )
|
|
322
|
|
323 new( "MosaicsFit",
|
|
324 mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
|
|
325 chrID=binData@chrID, coord=binData@coord,
|
|
326 tagCount=binData@tagCount, input=binData@input,
|
|
327 bic1S=fitBIC_1S, bic2S=fitBIC_2S )
|
|
328 }
|