Mercurial > repos > dongjun > mosaics

--- a/mosaics.xml	Thu Jan 10 15:57:23 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,313 +0,0 @@
-<tool id="MOSAiCS" name="MOSAiCS: MOdel-based one and two Sample Analysis and inference for ChIP-Seq Data" version="2.0.0">
-
-  <description></description>
-
-  <parallelism method="basic"></parallelism>
-
-  <requirements>
-	  <requirement type="binary">R</requirement>
-  </requirements>
-
-  <command interpreter="perl">
-    mosaics_wrapper.pl
-      ## ChIP file info
-      $readFileType.chipParams.chip
-      $readFileType.chipParams.chipFileFormat
-      ## control file info
-      $readFileType.controlParams.control
-      $readFileType.controlParams.controlFileFormat
-      ## peak file info
-      $out_peak
-      $OutfileFormat
-      ## analysis type
-      IO
-      ## optional output
-      $report_summary
-      $report_gof
-      $report_exploratory
-      ## settings for model fitting and peak calling: required (FALSE, FALSE, 0.05, 200, 50, 0)
-      $readFileType.pet
-      $by_chr
-      $fdrLevel
-      $fragLen
-      $binSize
-      $capping
-      #if $fitParams.fSettingsType == "preSet"
-      ## settings for model fitting and peak calling: optional
-	BIC
-	automatic
-	0.25
-	200
-	50
-	10
-      ## setting for parallel computing
-    TRUE
-    8
-      #else
-	$fitParams.signalModel
-	$fitParams.bgEst
-	$fitParams.d
-	$fitParams.maxgap
-	$fitParams.minsize
-	$fitParams.thres
-	$fitParams.parallel
-	$fitParams.nCore
-      #end if
-  </command>
-
-  <inputs>
-  	 <conditional name="readFileType">
-		<param name="pet" type="select" label="Paired-end tag (PET) or single-end tag (SET) data">
-			<option value="FALSE">Single-end tag (SET) data</option>
-			<option value="TRUE">Paired-end tag (PET) data</option>
-		</param>
-		<when value="FALSE"> <!-- SET -->
-			<conditional name="chipParams">
-				<param name="chipFileFormat" type="select" label="Select file format for ChIP sample" help="MOSAiCS accepts aligned read files as input. MOSAiCS accepts Eland result, Eland extended, Eland export, Bowtie default, SAM, BED, and CSEM file formats for single-end tag (SET) data.">
-					<option value="eland_result">Eland result</option>
-					<option value="eland_extended">Eland extended</option>
-					<option value="eland_export">Eland export</option>
-					<option value="bowtie">Bowtie default</option>
-					<option value="sam">SAM</option>
-					<option value="bed">BED</option>
-					<option value="csem">CSEM</option>
-				</param>
-				<when value="eland_result">
-					<param name="chip" type="data" format="eland" label="Eland result file for ChIP sample"/>
-				</when>
-				<when value="eland_extended">
-					<param name="chip" type="data" format="eland" label="Eland extended file for ChIP sample"/>
-				</when>
-				<when value="eland_export">
-					<param name="chip" type="data" format="eland" label="Eland export file for ChIP sample"/>
-				</when>
-				<when value="bowtie">
-					<param name="chip" type="data" label="Bowtie default file for ChIP sample"/>
-				</when>
-				<when value="sam">
-					<param name="chip" type="data" format="sam" label="SAM file for ChIP sample"/>
-				</when>
-				<when value="bed">
-					<param name="chip" type="data" format="bed" label="BED file for ChIP sample"/>
-				</when>
-				<when value="csem">
-					<param name="chip" type="data" format="csem" label="CSEM file for ChIP sample"/>
-				</when>
-			</conditional> <!-- chipParams -->
-
-			<conditional name="controlParams">
-				<param name="controlFileFormat" type="select" label="Select file format for control sample" help="MOSAiCS accepts aligned read files as input. MOSAiCS accepts Eland result, Eland extended, Eland export, Bowtie default, SAM, BED, and CSEM file formats for single-end tag (SET) data.">
-					<option value="eland_result">Eland result</option>
-					<option value="eland_extended">Eland extended</option>
-					<option value="eland_export">Eland export</option>
-					<option value="bowtie">Bowtie default</option>
-					<option value="sam">SAM</option>
-					<option value="bed">BED</option>
-					<option value="csem">CSEM</option>
-				</param>
-				<when value="eland_result">
-					<param name="control" type="data" format="eland" label="Eland result file for control sample"/>
-				</when>
-				<when value="eland_extended">
-					<param name="control" type="data" format="eland" label="Eland extended file for control sample"/>
-				</when>
-				<when value="eland_export">
-					<param name="control" type="data" format="eland" label="Eland export file for control sample"/>
-				</when>
-				<when value="bowtie">
-					<param name="control" type="data" label="Bowtie default file for control sample"/>
-				</when>
-				<when value="sam">
-					<param name="control" type="data" format="sam" label="SAM file for control sample"/>
-				</when>
-				<when value="bed">
-					<param name="control" type="data" format="bed" label="BED file for control sample"/>
-				</when>
-				<when value="csem">
-					<param name="control" type="data" format="csem" label="CSEM file for control sample"/>
-				</when>
-			</conditional> <!-- controlParams -->
-		</when>
-		<when value="TRUE">	<!-- PET -->
-			<conditional name="chipParams">
-				<param name="chipFileFormat" type="select" label="Select file format for ChIP sample" help="MOSAiCS accepts aligned read files as input. MOSAiCS accepts Eland result and SAM file formats for paired-end tag (PET) data.">
-					<option value="eland_result">Eland result</option>
-					<option value="sam">SAM</option>
-				</param>
-				<when value="eland_result">
-					<param name="chip" type="data" format="eland" label="Eland result file for ChIP sample"/>
-				</when>
-				<when value="sam">
-					<param name="chip" type="data" format="sam" label="SAM file for ChIP sample"/>
-				</when>
-			</conditional> <!-- chipParams -->
-
-			<conditional name="controlParams">
-				<param name="controlFileFormat" type="select" label="Select file format for control sample" help="MOSAiCS accepts aligned read files as input. MOSAiCS accepts Eland result and SAM file formats for paired-end tag (PET) data.">
-					<option value="eland_result">Eland result</option>
-					<option value="sam">SAM</option>
-				</param>
-				<when value="eland_result">
-					<param name="control" type="data" format="eland" label="Eland result file for control sample"/>
-				</when>
-				<when value="sam">
-					<param name="control" type="data" format="sam" label="SAM file for control sample"/>
-				</when>
-			</conditional> <!-- controlParams -->
-		</when>
-	</conditional><!-- readFileType -->
-
-	<param name="OutfileFormat" type="select" label="Select file format for peak calling results" help="MOSAiCS can export peak calling results into BED or GFF file formats, or as a table.">
-		<option value="bed">BED</option>
-		<option value="gff">GFF</option>
-		<option value="txt">table</option>
-	</param>
-	<param name="summary" type="boolean" truevalue="1" falsevalue="0" display="checkboxes" label="Reports for diagnostics: Summary of model fitting and peak calling" />
-	<param name="gof" type="boolean" truevalue="1" falsevalue="0" display="checkboxes" label="Reports for diagnostics: Goodness of fit (GOF) plots" />
-	<param name="exploratory" type="boolean" truevalue="1" falsevalue="0" display="checkboxes" label="Reports for diagnostics: Plots of exploratory analysis" />
-
-	<param name="by_chr" type="select" label="Genome-wide analysis or chromosome-wise analysis" help="If genome-wide analysis is used, one model is fitted for all the chromosomes. If chromosome-wise analysis is used, different model is fitted for each chromosome separately." >
-		<option value="FALSE">Genome-wide analysis</option>
-		<option value="TRUE">Chromosome-wise analysis</option>
-	</param>
-	<param name="fdrLevel" type="float" value="0.05" min="0" max="1" label="False discovery rate (FDR)" help="FDR level for peak detection (default: 0.05)" />
-	<param name="fragLen" type="integer" value="200" label="Average fragment length" help="Default: 200." />
-	<param name="binSize" type="integer" value="200" label="Bin size" help="By default, bin size equals to the average fragment length." />
-	<param name="capping" type="integer" value="0" label="Maximum number of reads allowed to start at each nucleotide position" help="If non-positive value is specified (e.g., 0), any number of reads are allowed at each nucleotide position (i.e., no filtering). By default, filtering is NOT used." />
-
-	<conditional name="fitParams">
-		<param name="fSettingsType" type="select" label="Settings for model fitting and peak calling" help="For most peak calling applications, use the 'Commonly used' setting. If you want access to all parameters, use 'Full parameter list'.">
-			<option value="preSet">Commonly used</option>
-			<option value="full">Full parameter list</option>
-		</param>
-		<when value="preSet" />
-		<when value="full">
-			<param name="signalModel" type="select" label="Signal model" help="By default, signal model is chosen using BIC.">
-				<option value="BIC">Automatic model selection based on BIC</option>
-				<option value="1S">One-signal-component model</option>
-				<option value="2S">Two-signal-component model</option>
-			</param>
-			<param name="bgEst" type="select" label="Background estimation approach" help="By default, background estimation approach is automatically determined based on the data.">
-				<option value="automatic">Automatic selection based on the data</option>
-				<option value="matchLow">Based on bins with low tag counts</option>
-				<option value="rMOM">Robust method of moment (MOM)</option>
-			</param>
-			<param name="d" type="float" value="0.25" label="d" help="Parameter for estimating background distribution. Default is 0.25." />
-			<param name="maxgap" type="integer" value="200" label="maxgap" help="Initial nearby peaks are merged if the distance (in bp) between them is less than 'maxgap'. Default is 200." />
-			<param name="minsize" type="integer" value="50" label="minsize" help="An initial peak is removed if its width is narrower than 'minsize'. Default is 50." />
-			<param name="thres" type="integer" value="10" label="thres" help="A bin within initial peak is removed if its ChIP tag counts are less than 'thres'. Default is 10." />
-			<param name="parallel" type="select" label="Use parallel computing?">
-				<option value="TRUE">Use parallel computing.</option>
-				<option value="FALSE">NOT use parallel computing.</option>
-			</param>
-			<param name="nCore" type="integer" value="8" label="Number of CPUs" help="Number of CPUs used for parallel computing. Relevant only when parallel computing is used. Default is to use 8 CPUs." />
-		</when> <!-- full -->
-	</conditional> <!-- fitParams -->
-  </inputs>
-
-  <outputs>
-	<data format="tabular" name="out_peak">
-		<change_format>
-			<when input="OutfileFormat" value="bed" format="bed" />
-			<when input="OutfileFormat" value="gff" format="gff" />
-		</change_format>
-	</data>
-	<data format="txt" name="report_summary">
-		<filter>summary == 1</filter>
-	</data>
-	<data format="pdf" name="report_gof">
-		<filter>gof == 1</filter>
-	</data>
-	<data format="pdf" name="report_exploratory">
-		<filter>exploratory == 1</filter>
-	</data>
-  </outputs>
-
-  <help>
-
-**What it does**
-
-MOSAiCS is a statistical framework for the analysis of ChIP-seq data and it stands for MOdel-based one and two Sample Analysis and Inference for ChIP-Seq Data. MOSAiCS is based on a flexible parametric mixture modeling approach for detecting peaks (i.e., enriched regions).
-MOSAiCS is also available in Bioconductor_ as a R package.
-We encourage questions or requests regarding MOSAiCS to be posted on our `Google group`_.
-
-Please cite: Kuan PF, Chung D, Pan G, Thomson JA, Stewart R, and Keles S (2011), "`A statistical framework for the analysis of ChIP-Seq data`_," *Journal of the American Statistical Association*, Vol. 106, pp. 891--903.
-
-.. _Bioconductor: http://www.bioconductor.org/help/bioc-views/2.11/bioc/html/mosaics.html
-.. _Google group: http://groups.google.com/group/mosaics_user_group
-.. _A statistical framework for the analysis of ChIP-Seq data: http://pubs.amstat.org/doi/abs/10.1198/jasa.2011.ap09706
-
-------
-
-**Input formats**
-
-MOSAiCS accepts aligned read files of ChIP and control samples as input. Currently, MOSAiCS accepts Eland result, Eland extended, Eland export, Bowtie default, SAM, BED, and CSEM formats for single-end tag (SET) data. For paired-end tag (PET) data, MOSAiCS accepts Eland result and SAM formats.
-
-------
-
-**Outputs**
-
-Peak calling results of MOSAiCS can be exported into BED or GFF file formats, or as a table. Each line of the output file specifies a single peak.
-
-If the output is a table, it has the following columns::
-
-   Column    Description
-   --------  --------------------------------------------------------
-     1       Chromosome of the peak
-     2       Start position of the peak
-     3       End position of the peak
-     4       Width of the peak
-     5       Averaged posterior probability of the peak
-     6       Minimum posterior probability of the peak
-     7       Averaged ChIP tag counts of the peak
-     8       Maximum ChIP tag counts of the peak
-     9       Averaged control tag counts of the peak
-    10       Averaged control tag counts of the peak, scaled by sequencing depth
-    11       Averaged log base 2 ratio of ChIP over input tag counts
-
-If the output is in BED format, it has the following columns::
-
-    Column        Description
-    ------------  --------------------------------------------------------
-    1 chrom       Chromosome of the peak
-    2 chromStart  Start position of the peak
-    3 chromEnd    End position of the peak
-    4 name        Always "MOSAiCS_peak"
-    5 score       Averaged ChIP tag counts of the peak
-
-If the output is in GFF format, it has the following columns::
-
-    Column     Description
-    ---------  --------------------------------------------------------
-    1 seqname  Chromosome of the peak
-    2 source   Always "MOSAiCS"
-    3 feature  Always "MOSAiCS_peak"
-    4 start    Start position of the peak
-    5 end      End position of the peak
-    6 score    Averaged ChIP tag counts of the peak
-    7 strand   Always "."
-    8 frame    Always "."
-    9 group    Always "."
-
-------
-
-**Reports for diagnostics**
-
-*Summary of model fitting and peak calling*: This report provides information about input and output files, parameter settings used for model fitting and peak calling, and brief summary of peak calling results.
-
-*Goodness of fit (GOF) plots*: This report allows visual comparisons of the fits of the background, one-signal-component, and two-signal-component models with the actual data.
-
-*Plots of exploratory analysis*: This report provides the histograms of ChIP and control samples and the scatter plots of ChIP versus control tag counts.
-
-More details regarding these reports can be found here_:
-
-------
-
-**Settings for model fitting and peak calling**
-
-More details about the tuning of these parameters can be found here_:
-
-.. _here: http://www.bioconductor.org/packages/2.11/bioc/vignettes/mosaics/inst/doc/mosaics-example.pdf
-
-  </help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/DESCRIPTION	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,20 @@
+Package: mosaics
+Type: Package
+Title: MOSAiCS (MOdel-based one and two Sample Analysis and Inference
+        for ChIP-Seq)
+Version: 1.5.4
+Depends: R (>= 2.11.1), methods, graphics, Rcpp
+Imports: MASS, splines, lattice, IRanges
+Suggests: mosaicsExample
+Enhances: parallel
+LinkingTo: Rcpp
+SystemRequirements: Perl
+Date: 2012-09-24
+Author: Dongjun Chung, Pei Fen Kuan, Sunduz Keles
+Maintainer: Dongjun Chung <chungdon@stat.wisc.edu>
+Description: This package provides functions for fitting MOSAiCS, a statistical framework to analyze one-sample or two-sample ChIP-seq data.
+License: GPL (>= 2)
+URL: http://groups.google.com/group/mosaics_user_group
+LazyLoad: yes
+biocViews: ChIPseq, Sequencing, Transcription, Genetics, Bioinformatics
+Packaged: 2012-09-28 05:01:28 UTC; chungdon
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/NAMESPACE	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,11 @@
+useDynLib(mosaics)
+
+import( methods, graphics, Rcpp, MASS, splines, lattice, IRanges )
+
+export( constructBins, readBins, mosaicsFit, mosaicsPeak, mosaicsRunAll, generateWig )
+
+export( show, print, plot )
+
+export( estimates )
+
+export( export )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/NEWS	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,147 @@
+Changes in version 1.5.4:
+
+    o Polish help documents of constructBins(), generateWig(), and mosaicsRunAll().
+
+    o Polish the vignette.
+
+Changes in version 1.5.3:
+
+BUG FIXES
+
+    o mosaicsRunAll(): Bug fix when byChr = TRUE.
+
+Changes in version 1.5.2:
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o constructBins(): Supports aligned read file formats for PET data (eland results and SAM formats).
+
+    o mosaicsRunAll(): Supports aligned read file formats for PET data.
+
+    o Add generateWig(): Constructs wiggle files for PET and SET data.
+
+    o Use tab separator instead of whitespaces for generateWig() and constructBins().
+
+    o Improve the vignette (case studies, example lines for input files, generateWig()).
+
+BUG FIXES
+
+    o constructBins(): Bug fix for capping and excludeChr. Fix incorrect summary when byChr = TRUE.
+
+    o mosaicsRunAll(): Bug fix for excludeChr & handling the full path for chipFile and controlFile.
+
+Changes in version 1.5.1:
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o constructBins(): Chromosome information can now be specified.
+
+    o mosaicsRunAll(): Chromosome information can now be specified.
+
+
+Changes in version 1.4.1:
+
+BUG FIXES
+
+    o constructBins(): Bug fix for the "outfileLoc" argument.
+
+    o mosaicsFit(): Minor changes in two-signal-component model fitting.
+
+    o mosaicsPeak(): No warning with the updated IRanges package.
+
+
+Changes in version 1.3.4:
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o Improve help documents for all classes and functions.
+
+BUG FIXES
+
+    o mosaicsPeak(): Correct bin size calculation when binsize=NA.
+
+Changes in version 1.3.2:
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o Simplify arguments of mosaicsRunAll(), constructBins(), and export().
+
+    o Add parallel argument in mosaicsFit().
+
+    o Extensive use of parallel processing/computing.
+
+    o Overall speed improvements in the package.
+
+    o Update the vignette.
+
+    o Use parallel package instead of multicore package.
+
+Changes in version 1.2.5:
+
+    o Correct version number in DESCRIPTION and package?mosaics.
+
+Changes in version 1.2.4:
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o Add parallel argument in readBins().
+
+    o Add parallel argument in mosaicsRunAll().
+
+BUG FIXES
+
+    o DESCRIPTION: 'multicore' package in 'Enhances' instead of 'Suggests'.
+
+Changes in version 1.2.3:
+
+NEW FEATURES
+
+    o New model for deeply sequenced ChIP-seq data.
+
+    o Genome-wide analysis of ChIP-seq data is now available.
+
+    o Supports more aligned read file formats: eland_result, eland_extended, eland_export, bowtie, SAM, BED, CSEM.
+
+    o Preprocessing of aligned read files can be done within the R environment using constructBins().
+
+    o Easier model fitting for the two sample analysis using mosaicsRunAll().
+
+    o Preprocessing and model fitting become much faster (Rcpp).
+
+    o Parallel processing/computing is now supported (multicore).
+
+SIGNIFICANT USER-VISIBLE CHANGES
+
+    o Add constructBins(): Preprocess aligned read files to bin-level files.
+
+    o Add mosaicsRunAll(): Convenient two sample analysis.
+
+    o Add bgEst argument in mosaicsFit(): Choose background estimation approach.
+
+    o Add nCore argument in readBins(): Parallel processing.
+
+    o The vignette is now extensively updated.
+
+    o Rcpp package is required and multicore package is suggested.
+
+DEPRECATED AND DEFUNCT
+
+    o Drop chrID argument in export().
+
+BUG FIXES
+
+    o Fix mosaicsPeak() for the case that no peak is called.
+
+    o Fix export() by removing unnecessary spaces in output text files.
+
+Changes in version 1.2.0:
+
+   o BioConductor release 2.9.
+
+Changes in version 1.0.1:
+
+   o Fixes for 'plot' method of class 'binData'.
+
+Changes in version 1.0.0:
+
+   o On BioConductor (release 2.8).
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/AllClasses.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,81 @@
+
+# S4 class "BinData": bin-level data imported from the output of the perl codes
+
+setClass( Class="BinData",
+    representation=representation(
+        chrID="character",
+        coord="numeric",
+        tagCount="numeric",
+        mappability="numeric",
+        gcContent="numeric",
+        input="numeric",
+        dataType="character"
+    )
+)
+
+# S4 classes "MosaicsFitEst", "MosaicsFitParam", "MosaicsFit": results of the MOSAiCS Z0 & Z1 model fit
+
+setClass( Class="MosaicsFitEst",
+    representation=representation(
+        pi0="numeric",
+        a="numeric",
+        betaEst="numeric",
+        muEst="numeric",
+        pNfit="list",
+        b="numeric",
+        c="numeric",
+        p1="numeric",
+        b1="numeric",
+        c1="numeric",
+        b2="numeric",
+        c2="numeric",
+        inputTrunc="numeric",
+        analysisType="character"
+    )
+)
+
+setClass( Class="MosaicsFitParam",
+    representation=representation(
+        k="numeric",
+        meanThres="numeric",
+        s="numeric",
+        d="numeric"
+    )
+)
+
+setClass( Class="MosaicsFit",
+    representation=representation(
+        mosaicsEst="MosaicsFitEst",
+        mosaicsParam="MosaicsFitParam",
+        chrID="character",
+        coord="numeric",
+        tagCount="numeric",
+        mappability="numeric",
+        gcContent="numeric",
+        input="numeric",
+        bic1S="numeric",
+        bic2S="numeric"
+    )
+)
+
+# S4 classes "MosaicsPeakParam", "MosaicsPeak": results of the final MOSAiCS peak calling
+
+setClass( Class="MosaicsPeakParam",
+    representation=representation(
+        analysisType="character",
+        signalModel="character",
+        FDR="numeric",
+        maxgap="numeric",
+        minsize="numeric",
+        thres="numeric"
+    )
+)
+
+setClass( Class="MosaicsPeak",
+    representation=representation(
+        peakList="data.frame",
+        peakParam="MosaicsPeakParam",
+        bdBin="data.frame",
+        empFDR="numeric"
+    )
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/AllGenerics.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,71 @@
+
+# S4 generics for the "BinData" class; each one dispatches on 'object' via standardGeneric()
+
+setGeneric( "chrID",
+    function( object, ... )
+    standardGeneric("chrID")
+)
+
+setGeneric( "coord",
+    function( object, ... )
+    standardGeneric("coord")
+)
+
+setGeneric( "tagCount",
+    function( object, ... )
+    standardGeneric("tagCount")
+)
+
+setGeneric( "mappability",
+    function( object, ... )
+    standardGeneric("mappability")
+)
+
+setGeneric( "gcContent",
+    function( object, ... )
+    standardGeneric("gcContent")
+)
+
+setGeneric( "input",
+    function( object, ... )
+    standardGeneric("input")
+)
+
+# S4 generics for the "MosaicsFit" class (same 'object, ...' signature as above)
+
+setGeneric( "mosaicsFit",
+    function( object, ... )
+    standardGeneric("mosaicsFit")
+)
+
+setGeneric( "estimates",
+    function( object, ... )
+    standardGeneric("estimates")
+)
+
+# S4 generics for the "MosaicsPeak" class (same 'object, ...' signature as above)
+
+setGeneric( "mosaicsPeak",
+    function( object, ... )
+    standardGeneric("mosaicsPeak")
+)
+
+setGeneric( "peakList",
+    function( object, ... )
+    standardGeneric("peakList")
+)
+
+setGeneric( "export",
+    function( object, ... )
+    standardGeneric("export")
+)
+
+setGeneric( "bdBin",
+    function( object, ... )
+    standardGeneric("bdBin")
+)
+
+setGeneric( "empFDR",
+    function( object, ... )
+    standardGeneric("empFDR")
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_adapGridMosaicsZ0_IO.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,63 @@
+
+# .adapGridMosaicsZ0_IO(): adaptive griding of X (Input), then background fit via .mosaicsZ0().
+#   Y                 : tabulated into Y_val / Y_freq and passed to .mosaicsZ0()
+#   X                 : distinct values are pooled from largest to smallest; a pool is closed once it
+#                       holds more than min_n_X elements; each X becomes the median of its pool's values
+#   bgEst, inputTrunc : forwarded unchanged to .mosaicsZ0()
+#   parallel, nCore   : forwarded to .mosaicsZ0(); explicit defaults replace the self-referential
+#                       'parallel=parallel, nCore=nCore', which errored whenever they were omitted
+.adapGridMosaicsZ0_IO <- function( Y, X, bgEst=NA, inputTrunc, min_n_X=50,
+    parallel=FALSE, nCore=8 )
+{
+    Y_freq <- table(Y)
+
+    # adaptive griding for X (Input)
+
+    X_set <- sort( unique(X), decreasing=TRUE )
+    ind_X_set <- rep( 0, length(X_set) )
+
+    ind_now <- 1
+    N_now <- 0
+
+    for ( i in seq_along(X_set) )    # seq_along(): skip the loop entirely when X is empty
+    {
+        N_i <- length( which( X==X_set[i] ) )
+        if ( N_now <= min_n_X )
+        {
+            ind_X_set[i] <- ind_now
+            N_now <- N_now + N_i
+        } else
+        {
+            ind_now <- ind_now + 1
+            ind_X_set[i] <- ind_now
+            N_now <- N_i
+        }
+    }
+
+    X_set_new <- rep( 0, length(X_set) )
+    for ( i in seq_len( length(unique(ind_X_set)) ) )
+    {
+        X_set_new[ind_X_set==i] <- median( X_set[ind_X_set==i] )
+    }
+
+    X_new <- rep( 0, length(X) )
+    for ( i in seq_along(X_set) )
+    {
+        X_new[ X==X_set[i] ] <- X_set_new[i]
+    }
+
+    # background fit
+
+    par_est2 <- .mosaicsZ0( Y=Y, bgEst=bgEst, analysisType="IO",
+        X=X_new, inputTrunc=inputTrunc, Y_freq=Y_freq,
+        parallel=parallel, nCore=nCore )
+
+    # return object: estimates from .mosaicsZ0() plus the frequency table of Y
+
+    par_est_final <- list( X_u = par_est2$X_u, a_u = par_est2$a_u, b_u = par_est2$b_u,
+        mean0_u = par_est2$mean0_u, var0_u = par_est2$var0_u,
+        u0_u = par_est2$u0_u, u1_u = par_est2$u1_u, u2_u = par_est2$u2_u,
+        n_u = par_est2$n_u, ty_u = par_est2$ty_u,
+        Y_val = as.numeric(names(Y_freq)), Y_freq = Y_freq )
+    return( par_est_final )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_adapGridMosaicsZ0_OS.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,141 @@
+
+# .adapGridMosaicsZ0_OS(): background fit via .mosaicsZ0() on successively coarser (M,GC) grids.
+#   Y, M, GC        : vectors passed to .mosaicsZ0(); M and GC are re-gridded between iterations
+#   bgEst           : forwarded unchanged to .mosaicsZ0()
+#   min_n_MGC       : (M,GC) pairs with n_u below this are re-fitted on the next grid in grids_MGC
+#   parallel, nCore : forwarded to .mosaicsZ0(); explicit defaults replace the self-referential
+#                     'parallel=parallel, nCore=nCore', which errored whenever they were omitted
+.adapGridMosaicsZ0_OS <- function( Y, M, GC, bgEst=NA,
+    min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50),
+    parallel=FALSE, nCore=8 )
+{
+    M_u <- GC_u <- a_u <- b_u <- mean0_u <- var0_u <-
+        u0_u <- u1_u <- u2_u <- n_u <- ty_u <- unitM_u <- c()
+
+    Y_freq <- table(Y)
+
+
+    # iteration of adaptive griding
+
+    for ( i in seq_along(grids_MGC) )
+    {
+        message( "Info: grid = ", grids_MGC[i] )
+
+        if ( i==1 )
+        {
+            # initialization
+
+            Y2 <- Y
+            M2 <- M
+            GC2 <- GC
+        } else
+        {
+            # check (M,GC) pair satisfying the condition
+            # if condition satisfied for (M,GC) pair,
+            # then keep results and remove (M,GC) pair
+            # otherwise, retain (M,GC) pair
+
+            Y2 <- M2 <- GC2 <- c()
+            N <- length(par_est2$M_u)
+
+            for ( j in seq_len(N) )
+            {
+                if ( par_est2$n_u[j] < min_n_MGC )
+                {
+                    Y_sub_j <- par_est2$Y_sub_list[[j]]
+                    nj <- length(Y_sub_j)
+
+                    Y2 <- c( Y2, Y_sub_j )
+                    M2 <- c( M2, rep( par_est2$M_u[j], nj ) )
+                    GC2 <- c( GC2, rep( par_est2$GC_u[j], nj ) )
+                } else
+                {
+                    M_u <- c( M_u, par_est2$M_u[j] )
+                    GC_u <- c( GC_u, par_est2$GC_u[j] )
+                    a_u <- c( a_u, par_est2$a_u[j] )
+                    b_u <- c( b_u, par_est2$b_u[j] )
+                    mean0_u <- c( mean0_u, par_est2$mean0_u[j] )
+                    var0_u <- c( var0_u, par_est2$var0_u[j] )
+                    n_u <- c( n_u, par_est2$n_u[j] )
+                    u0_u <- c( u0_u, par_est2$u0_u[j] )
+                    u1_u <- c( u1_u, par_est2$u1_u[j] )
+                    u2_u <- c( u2_u, par_est2$u2_u[j] )
+                    ty_u <- c( ty_u, par_est2$ty_u[j] )
+                    unitM_u <- c( unitM_u, grids_MGC[(i-1)] )
+                }
+            }
+
+
+            # redefine grids
+
+            if ( length(M2) > 0 )
+            {
+                unitM <- grids_MGC[i]
+                M3 <- M2
+                GC3 <- GC2
+
+                for ( j in 1:((1/unitM)-1) )
+                {
+                    M2.j <- M2[ M2>=unitM*(j-1) & M2<unitM*j ]
+                    GC2.j <- GC2[ GC2>=unitM*(j-1) & GC2<unitM*j ]
+
+                    M3[ M2>=unitM*(j-1) & M2<unitM*j ] <- median(M2.j)
+                    GC3[ GC2>=unitM*(j-1) & GC2<unitM*j ] <- median(GC2.j)
+                }
+
+                M2.j <- M2[ M2>=unitM*((1/unitM)-1) & M2<=unitM*(1/unitM) ]
+                GC2.j <- GC2[ GC2>=unitM*((1/unitM)-1) & GC2<=unitM*(1/unitM) ]
+
+                M3[ M2>=unitM*((1/unitM)-1) & M2<=unitM*(1/unitM) ] <- median(M2.j)
+                GC3[ GC2>=unitM*((1/unitM)-1) & GC2<=unitM*(1/unitM) ] <- median(GC2.j)
+
+                M2 <- M3
+                GC2 <- GC3
+                rm( M3, GC3 )
+            }
+        }
+
+
+        # if there is no more (M,GC) pair, then stop iterations
+
+        if ( length(Y2)==0 )
+        {
+            break
+        }
+
+
+        # background fit
+
+        par_est2 <- .mosaicsZ0( Y=Y2, bgEst=bgEst, analysisType="OS",
+            M=M2, GC=GC2, Y_freq=Y_freq,
+            parallel=parallel, nCore=nCore )
+
+
+        # if it is the last iteration, keep the remaining
+
+        if ( i==length(grids_MGC) )
+        {
+            M_u <- c( M_u, par_est2$M_u )
+            GC_u <- c( GC_u, par_est2$GC_u )
+            a_u <- c( a_u, par_est2$a_u )
+            b_u <- c( b_u, par_est2$b_u )
+            mean0_u <- c( mean0_u, par_est2$mean0_u )
+            var0_u <- c( var0_u, par_est2$var0_u )
+            n_u <- c( n_u, par_est2$n_u )
+            u0_u <- c( u0_u, par_est2$u0_u )
+            u1_u <- c( u1_u, par_est2$u1_u )
+            u2_u <- c( u2_u, par_est2$u2_u )
+            ty_u <- c( ty_u, par_est2$ty_u )
+            unitM_u <- c( unitM_u, rep( grids_MGC[i], length(par_est2$M_u) ) )
+        }
+    }
+
+
+    # return object
+
+    par_est_final <- list( M_u = M_u, GC_u = GC_u, a_u = a_u, b_u = b_u,
+        mean0_u = mean0_u, var0_u = var0_u,
+        u0_u = u0_u, u1_u = u1_u, u2_u = u2_u, n_u = n_u, ty_u = ty_u,
+        Y_val = as.numeric(names(Y_freq)), Y_freq = Y_freq, unitM_u = unitM_u )
+    return( par_est_final )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_adapGridMosaicsZ0_TS.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,180 @@
+
+# Adaptive-grid background (Z0) estimation for the two-sample ("TS") analysis.
+#
+# Step 1 walks through the unique values of X from largest to smallest and
+# groups them; a group is closed once it holds more than min_n_X observations.
+# Every X is then replaced by the median of the unique values in its group.
+#
+# Step 2 calls .mosaicsZ0() with analysisType="TS" once per grid size in
+# grids_MGC. Strata with at least min_n_MGC observations are kept as they are;
+# the observations of the remaining strata are re-binned for the next grid,
+# where M and GC are replaced by the median of their bin. The bins cover
+# [0, 1]; values outside that range are left unchanged. On the last grid all
+# remaining strata are kept.
+#
+# Arguments:
+#   Y, M, GC, X   vectors handed to .mosaicsZ0() as Y, M, GC and X
+#   bgEst         passed on to .mosaicsZ0()
+#   min_n_MGC     minimum stratum size for a stratum to be accepted
+#   grids_MGC     grid sizes for M and GC, tried in the given order
+#   min_n_X       group size threshold used in Step 1
+#   parallel, nCore   passed on to .mosaicsZ0(). The defaults used to be
+#                 self-referential (parallel=parallel, nCore=nCore), which
+#                 fails whenever the caller omits them.
+#
+# Returns a list with the per-stratum estimates (X_u, M_u, GC_u, a_u, b_u,
+# mean0_u, var0_u, u0_u, u1_u, u2_u, n_u, ty_u), the grid size at which each
+# stratum was accepted (unitM_u), and the distinct values / frequency table
+# of Y (Y_val, Y_freq).
+
+.adapGridMosaicsZ0_TS <- function( Y, M, GC, X, bgEst=NA,
+    min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), min_n_X=200,
+    parallel=FALSE, nCore=8 )
+{
+    X_u <- M_u <- GC_u <- a_u <- b_u <- mean0_u <- var0_u <-
+        u0_u <- u1_u <- u2_u <- n_u <- ty_u <- unitM_u <- c()
+
+    Y_freq <- table(Y)
+
+    # Step 1: adaptive griding for X (Input)
+
+    X_set <- sort( unique(X), decreasing=TRUE )
+    ind_X_set <- rep( 0, length(X_set) )
+
+    ind_now <- 1
+    N_now <- 0
+
+    # seq_along() instead of 1:length() so that an empty X is a no-op
+    for ( i in seq_along(X_set) )
+    {
+        N_i <- length( which( X==X_set[i] ) )
+        if ( N_now <= min_n_X )
+        {
+            # current group is not full yet: keep adding to it
+            ind_X_set[i] <- ind_now
+            N_now <- N_now + N_i
+        } else
+        {
+            # current group is full: open a new one
+            ind_now <- ind_now + 1
+            ind_X_set[i] <- ind_now
+            N_now <- N_i
+        }
+    }
+
+    # representative value of each group = median of its unique X values
+
+    X_set_new <- rep( 0, length(X_set) )
+    for ( i in seq_len( length(unique(ind_X_set)) ) )
+    {
+        X_set_new[ind_X_set==i] <- median( X_set[ind_X_set==i] )
+    }
+
+    X_new <- rep( 0, length(X) )
+    for ( i in seq_along(X_set) )
+    {
+        X_new[ X==X_set[i] ] <- X_set_new[i]
+    }
+
+
+    # Step 2: adaptive griding for M & GC, given X
+
+    for ( i in 1:length(grids_MGC) )
+    {
+        message( "Info: grid = ", grids_MGC[i] )
+
+        if ( i==1 )
+        {
+            # initialization
+
+            Y2 <- Y
+            M2 <- M
+            GC2 <- GC
+            X2 <- X_new
+        } else
+        {
+            # check whether (M,GC) pair satisfies the condition
+            # - if condition satisfied for (M,GC) pair,
+            # then keep results and remove (M,GC) pair from re-griding
+            # - otherwise, retain (M,GC) pair for re-griding
+            # (par_est2 is the fit obtained in the previous iteration)
+
+            Y2 <- M2 <- GC2 <- X2 <- c()
+            N <- length(par_est2$M_u)
+
+            for ( j in seq_len(N) )
+            {
+                if ( par_est2$n_u[j] < min_n_MGC )
+                {
+                    Y_sub_j <- par_est2$Y_sub_list[[j]]
+                    nj <- length(Y_sub_j)
+
+                    Y2 <- c( Y2, Y_sub_j )
+                    M2 <- c( M2, rep( par_est2$M_u[j], nj ) )
+                    GC2 <- c( GC2, rep( par_est2$GC_u[j], nj ) )
+                    X2 <- c( X2, rep( par_est2$X_u[j], nj ) )
+                } else
+                {
+                    X_u <- c( X_u, par_est2$X_u[j] )
+                    M_u <- c( M_u, par_est2$M_u[j] )
+                    GC_u <- c( GC_u, par_est2$GC_u[j] )
+                    a_u <- c( a_u, par_est2$a_u[j] )
+                    b_u <- c( b_u, par_est2$b_u[j] )
+                    mean0_u <- c( mean0_u, par_est2$mean0_u[j] )
+                    var0_u <- c( var0_u, par_est2$var0_u[j] )
+                    n_u <- c( n_u, par_est2$n_u[j] )
+                    u0_u <- c( u0_u, par_est2$u0_u[j] )
+                    u1_u <- c( u1_u, par_est2$u1_u[j] )
+                    u2_u <- c( u2_u, par_est2$u2_u[j] )
+                    ty_u <- c( ty_u, par_est2$ty_u[j] )
+                    # accepted on the grid of the previous iteration
+                    unitM_u <- c( unitM_u, grids_MGC[(i-1)] )
+                }
+            }
+
+
+            # redefine grids
+
+            if ( length(M2) > 0 )
+            {
+                unitM <- grids_MGC[i]
+                M3 <- M2
+                GC3 <- GC2
+
+                # bins [ unitM*(j-1), unitM*j ), all but the last one
+
+                for ( j in 1:((1/unitM)-1) )
+                {
+                    M2.j <- M2[ M2>=unitM*(j-1) & M2<unitM*j ]
+                    GC2.j <- GC2[ GC2>=unitM*(j-1) & GC2<unitM*j ]
+
+                    M3[ M2>=unitM*(j-1) & M2<unitM*j ] <- median(M2.j)
+                    GC3[ GC2>=unitM*(j-1) & GC2<unitM*j ] <- median(GC2.j)
+                }
+
+                # last bin is closed on the right so that 1 is included
+
+                M2.j <- M2[ M2>=unitM*((1/unitM)-1) & M2<=unitM*(1/unitM) ]
+                GC2.j <- GC2[ GC2>=unitM*((1/unitM)-1) & GC2<=unitM*(1/unitM) ]
+
+                M3[ M2>=unitM*((1/unitM)-1) & M2<=unitM*(1/unitM) ] <- median(M2.j)
+                GC3[ GC2>=unitM*((1/unitM)-1) & GC2<=unitM*(1/unitM) ] <- median(GC2.j)
+
+                M2 <- M3
+                GC2 <- GC3
+                rm( M3, GC3 )
+            }
+        }
+
+
+        # if there is no more (M,GC) pair, then stop iterations
+
+        if ( length(Y2)==0 )
+        {
+            break
+        }
+
+
+        # background fit
+
+        par_est2 <- .mosaicsZ0( Y=Y2, bgEst=bgEst, analysisType="TS",
+            X=X2, M=M2, GC=GC2, Y_freq=Y_freq,
+            parallel=parallel, nCore=nCore )
+
+        # share of strata that are still too small (diagnostic only)
+
+        smallN <- which( par_est2$n_u < min_n_MGC )
+        n.uns <- length(smallN) / length(par_est2$n_u)
+        #print( paste("percentage of (M,GC) pair with small n:",n.uns) )
+        #print( "distribution of n:" )
+        #print( quantile( par_est2$n_u, seq(0,1,0.05) ) )
+
+
+        # if it is the last iteration, keep the remaining
+
+        if ( i==length(grids_MGC) )
+        {
+            X_u <- c( X_u, par_est2$X_u )
+            M_u <- c( M_u, par_est2$M_u )
+            GC_u <- c( GC_u, par_est2$GC_u )
+            a_u <- c( a_u, par_est2$a_u )
+            b_u <- c( b_u, par_est2$b_u )
+            mean0_u <- c( mean0_u, par_est2$mean0_u )
+            var0_u <- c( var0_u, par_est2$var0_u )
+            n_u <- c( n_u, par_est2$n_u )
+            u0_u <- c( u0_u, par_est2$u0_u )
+            u1_u <- c( u1_u, par_est2$u1_u )
+            u2_u <- c( u2_u, par_est2$u2_u )
+            ty_u <- c( ty_u, par_est2$ty_u )
+            unitM_u <- c( unitM_u, rep( grids_MGC[i], length(par_est2$M_u) ) )
+        }
+    }
+
+
+    # return object (Y_freq was tabulated once at the top and is reused here)
+
+    par_est_final <- list( X_u = X_u, M_u = M_u, GC_u = GC_u, a_u = a_u, b_u = b_u,
+        mean0_u = mean0_u, var0_u = var0_u,
+        u0_u = u0_u, u1_u = u1_u, u2_u = u2_u, n_u = n_u, ty_u = ty_u,
+        Y_val = as.numeric(names(Y_freq)), Y_freq = Y_freq, unitM_u = unitM_u )
+    return( par_est_final )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_calcModelBIC.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,73 @@
+
+# Information criterion (AIC or BIC) of a fitted signal model, computed from
+# the observations with Y >= k only.
+#
+# Arguments:
+#   fitZ1   list of estimates; pi0 is always read, plus p1, b1, c1, b2, c2
+#           for model "2S" or b, c for model "1S"
+#   Y       vector of counts
+#   pNfit   list whose PYZ0 element is used as P(Y|Z0); it is also handed to
+#           .margDistZ1_2S() / .margDistZ1_1S()
+#   k       counts are shifted by k before the signal density is evaluated
+#   model   "2S" (two signal components) or "1S" (one signal component)
+#   type    "AIC" (penalty 2) or "BIC" (penalty log(number of Y >= k))
+#   npar    number of parameters, multiplied by the penalty
+#
+# Returns -2 * log-likelihood + penalty * npar. Terms whose log-likelihood is
+# NA are left out of the sum. An unsupported 'model' or 'type' is reported
+# with an explicit error instead of a later "object not found" failure.
+
+.calcModelBIC <- function( fitZ1, Y, pNfit, k=3, model="2S", type="BIC", npar )
+{
+    if ( !( model %in% c( "2S", "1S" ) ) )
+    {
+        stop( "unsupported 'model': ", model, " (expected \"2S\" or \"1S\")" )
+    }
+    if ( !( type %in% c( "AIC", "BIC" ) ) )
+    {
+        stop( "unsupported 'type': ", type, " (expected \"AIC\" or \"BIC\")" )
+    }
+
+    # parameter estimates (common)
+
+    pi0 <- fitZ1$pi0
+    PYZ0 <- pNfit$PYZ0
+
+    # choose Y>=k
+
+    id_geqk <- which(Y>=k)
+    Y_ori <- Y[id_geqk]
+
+    Yk <- Y_ori - k        # use only Y >= k
+    Ykmax <- max(Yk)
+
+    if ( model=="2S" )
+    {
+        # parameter estimates
+
+        p1 <- fitZ1$p1
+        b1 <- fitZ1$b1
+        c1 <- fitZ1$c1
+        b2 <- fitZ1$b2
+        c2 <- fitZ1$c2
+
+        # calculate log likelihood
+
+        PYZ1 <- .margDistZ1_2S( Yk, Ykmax, pNfit, b1, c1, b2, c2 )
+        PYZ1G1 <- PYZ1$MDG1
+        PYZ1G2 <- PYZ1$MDG2
+
+        logLik0 <- log( pi0*PYZ0 + (1-pi0)*( p1*PYZ1G1 + (1-p1)*PYZ1G2) )
+    } else
+    {
+        # model "1S": parameter estimates
+
+        b <- fitZ1$b
+        c <- fitZ1$c
+
+        # calculate log likelihood
+
+        PYZ1 <- .margDistZ1_1S( Yk, Ykmax, pNfit, b, c )
+
+        logLik0 <- log( pi0*PYZ0 + (1-pi0)*PYZ1 )
+    }
+    logLik1 <- sum( logLik0, na.rm=TRUE )
+
+    # penalty per parameter
+
+    penalty <- switch( type,
+        "AIC" = 2,
+        "BIC" = log( length(Y_ori) )
+    )
+    val <- -2 * logLik1 + penalty * npar
+
+    return(val)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_calcPN.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,43 @@
+
+# Pre-compute background quantities for the observations with Y >= k:
+#   PYZ0        negative binomial density of Y under the background model,
+#               dnbinom( Y, a, b/(b+1) ) with b = a / mu_est
+#   mu_round    mu_est rounded to 2 decimals
+#   mu_round_U  distinct values of mu_round
+#   pN          matrix with one row per element of mu_round_U and one column
+#               per count 0, 1, ..., max(Y-k), holding the background density
+#               evaluated with the rounded mean
+#
+# Arguments:
+#   Y        vector of counts
+#   k        only observations with Y >= k are used
+#   a        size parameter handed to dnbinom()
+#   mu_est   vector of background means, indexed like Y
+#
+# Returns list( k, pN, mu_round, mu_round_U, PYZ0 ).
+
+.calcPN <- function( Y, k, a, mu_est )
+{
+    b_est <- a / mu_est
+
+    # parameters for Y>=k
+
+    id_geqk <- which(Y>=k)
+    if ( length(id_geqk)==0 )
+    {
+        stop( "there is no observation with Y >= k" )
+    }
+
+    Yori <- Y[id_geqk]
+    b_est <- b_est[id_geqk]
+    mu_est <- mu_est[id_geqk]
+
+    # P( Y | Z0 )
+
+    PYZ0 <- dnbinom( Yori, a, b_est / (b_est+1) )
+
+    # process Y
+
+    Y <- Yori - k        # use only Y >= k
+
+    # round mu
+
+    mu_round <- round(mu_est,2)
+    mu_round_U <- unique(mu_round)
+
+    # prob of N (using rounding mu for prob of S)
+    # outer() always yields a length(mu_round_U) x length(YmaxVec) matrix.
+    # The former apply() + t() returned the transposed shape when max(Y)
+    # was 0, because apply() then collapses its result to a plain vector.
+
+    YmaxVec <- 0:max(Y)
+    b_round <- a / mu_round_U
+    pN <- outer( b_round, YmaxVec,
+        function( b, y ) dnbinom( y, a, b/(b+1) ) )
+
+    return( list( k=k, pN=pN, mu_round=mu_round, mu_round_U=mu_round_U, PYZ0=PYZ0 ) )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_calcYbdAll.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,50 @@
+# Expand the background-adjusted count table into a vector of signal counts.
+#
+# For each distinct count in MOSAiCS_Z0$Y_val the expected number of
+# background bins is computed from the negative binomial model (a, muEst) and
+# rounded; the evaluation stops after the first count whose expectation rounds
+# to 0. pi0 times that expectation is subtracted from the observed frequency
+# (negative results are set to 0). The first three distinct counts are then
+# dropped, k is subtracted from the rest, and each remaining value is
+# repeated as often as its adjusted frequency.
+#
+# Stops when nothing is left after the adjustment or when a single count
+# carries at least 90% of what is left; warns when it carries at least 80%.
+.calcYbdAll <- function( MOSAiCS_Z0, k ) {
+
+    a <- MOSAiCS_Z0$a
+    pi0 <- MOSAiCS_Z0$pi0
+    tab_Y <- MOSAiCS_Z0$Y_freq
+    Y_u <- MOSAiCS_Z0$Y_val
+
+    # distinct background rates and the number of bins sharing each of them
+
+    rateTab <- table( a / MOSAiCS_Z0$muEst )
+    rateVal <- as.numeric(names(rateTab))
+    rateCnt <- as.numeric(rateTab)
+    rateProb <- rateVal / (rateVal+1)
+
+    # N_Z0(y): rounded expected number of background bins with count y
+
+    expectedZ0 <- function( y ) {
+        round(sum( rateCnt * dnbinom( y, a, rateProb ) ))
+    }
+
+    Eyz0 <- rep( 0, length(Y_u) )
+    Eyz0[1] <- expectedZ0( Y_u[1] )
+
+    pos <- 2
+    while ( pos <= length(Y_u) && Eyz0[pos-1] > 0 )
+    {
+        Eyz0[pos] <- expectedZ0( Y_u[pos] )
+        pos <- pos + 1
+    }
+
+    # (1-pi0) * N_Z1(y) = N(y) - pi0 * N_Z0(y), floored at 0
+
+    signalCnt <- tab_Y - pi0*Eyz0
+    signalCnt[which(signalCnt<0)] <- 0
+
+    # drop the first three distinct counts; first column is adjusted for k
+
+    dropIdx <- -c(1:3)
+    Y_bd <- cbind( Y_u[dropIdx]-k, signalCnt[dropIdx] )
+
+    total <- sum(Y_bd[,2])
+    if ( total==0 ) {
+        # nothing left after background is excluded from the data
+        # risk level = extremely high
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+
+    topShare <- Y_bd[which.max(Y_bd[,2]),2] / total
+    if ( topShare >= 0.90 ) {
+        # almost nothing left after background is excluded from the data
+        # risk level = very high
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+    if ( topShare >= 0.80 ) {
+        # still not much left after background is excluded from the data
+        # risk level = medium high
+        warning( "there is some possibility of background over-estimation.\n" )
+        warning( "parameter tuning might provide better fits.\n" )
+    }
+
+    Y_bd_all <- rep(Y_bd[,1],Y_bd[,2])
+    return( Y_bd_all )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_computeStat.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,53 @@
+
+#########################################################
+# Compute mean/variance for sequenced samples for
+# exploratory analysis
+#########################################################
+
+# Per-stratum summary statistics of Y, with strata given by the values of S.
+#
+# Returns a list of vectors with one element per stratum:
+#   uS          stratum label converted to numeric
+#   nitem       number of observations
+#   nitemgeq0   number of observations different from 0
+#   p0, p1      proportion of observations equal to 0 / equal to 1
+#   meanYgeq0, varYgeq0, medYgeq0
+#               mean, variance and median of the observations different from 0
+#   meanYall, varYall
+#               mean and variance of all observations
+.computeStat <- function( Y, S )
+{
+    # split Y by stratum
+
+    paired <- as.data.frame( cbind( Y, S ) )
+    paired$S <- factor( paired$S )
+    ySubList <- split( paired$Y, factor(paired$S) )
+
+    # nine statistics for one stratum, in the order of statNames below
+
+    summarize <- function( counts ) {
+        counts <- as.numeric( as.vector(counts) )
+        total <- length(counts)
+        nonzero <- counts[counts!=0]
+
+        c( total, length(nonzero),
+            length(which( counts==0 )) / total,
+            length(which( counts==1 )) / total,
+            mean(nonzero), var(nonzero), median(nonzero),
+            mean(counts), var(counts) )
+    }
+    yStatMat <- matrix( unlist( lapply( ySubList, summarize ) ),
+        ncol = 9, byrow = TRUE )
+
+    # one list element per statistic (= per column of yStatMat)
+
+    statNames <- c( "nitem", "nitemgeq0", "p0", "p1",
+        "meanYgeq0", "varYgeq0", "medYgeq0", "meanYall", "varYall" )
+    statCols <- lapply( seq_along(statNames),
+        function(j) as.numeric( yStatMat[, j] ) )
+    names(statCols) <- statNames
+
+    yStats <- c( list( uS = as.numeric(names(ySubList)) ), statCols )
+
+    return(yStats)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_convolution_1S_cpp.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,5 @@
+
+# R-level entry point for the compiled routine "convolution_1S_cpp" of the
+# mosaics package; all arguments are handed over unchanged and the value
+# returned by the routine is returned as is.
+conv_1S <- function( y, mu_round, mu_round_U, pN, pS ) {
+	.Call( "convolution_1S_cpp", y, mu_round, mu_round_U, pN, pS, PACKAGE="mosaics" )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_convolution_2S_cpp.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,5 @@
+
+# R-level entry point for the compiled routine "convolution_2S_cpp" of the
+# mosaics package; all arguments are handed over unchanged and the value
+# returned by the routine is returned as is.
+conv_2S <- function( y, mu_round, mu_round_U, pN, pS1, pS2 ) {
+	.Call( "convolution_2S_cpp", y, mu_round, mu_round_U, pN, pS1, pS2, PACKAGE="mosaics" )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_export.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,70 @@
+
+# export all possible info
+
+# Write every column of peakList to 'filename' as tab-separated text:
+# one header line with the column names, then one line per row.
+# An empty peakList produces a file that contains the header line only.
+
+.exportTXT <- function( peakList, filename )
+{
+    outFormat <- peakList
+
+    # variable names (the first cat() creates / truncates the file)
+
+    cat( file=filename )
+    cat( as.character(colnames(outFormat)), file=filename, sep="\t", append=TRUE )
+    cat( "\n", file=filename, append=TRUE )
+
+    # peak list
+    # seq_len() rather than 1:nrow(): with zero rows 1:0 would iterate over
+    # c(1,0) and write two bogus lines
+
+    for ( i in seq_len( nrow(outFormat) ) )
+    {
+        cat( as.character(outFormat[i,]), file=filename, sep="\t", append=TRUE )
+        cat( "\n", file=filename, append=TRUE )
+    }
+
+    message( "Info: peak file was exported in TXT format:" )
+    message( "Info: file name = ", filename )
+}
+
+# GFF format (score = peak count)
+
+# Write the peaks to 'filename' in GFF layout, preceded by a track line.
+# Columns read from peakList: chrID, peakStart, peakStop, aveChipCount.
+# An empty peakList produces a file that contains the track line only.
+
+.exportGFF <- function( peakList, filename )
+{
+    # GFF: seqname, source, feature, start, end, score, strand, frame, group
+
+    # - constant columns are replicated explicitly so that data.frame() also
+    #   accepts an empty peak list
+    # - chrID is converted to character so that a factor is written with its
+    #   labels instead of its integer codes
+
+    nPeak <- length( peakList$chrID )
+    outFormat <- data.frame( as.character(peakList$chrID),
+        rep( "MOSAiCS", nPeak ), rep( "MOSAiCS_peak", nPeak ),
+        peakList$peakStart, peakList$peakStop, peakList$aveChipCount,
+        rep( ".", nPeak ), rep( ".", nPeak ), rep( ".", nPeak ),
+        stringsAsFactors=FALSE )
+
+    line0 <- 'track name=mosaicsPeaks description=\"MOSAiCS peaks\"'
+    cat( as.character(line0), "\n", file=filename )
+
+    # seq_len() rather than 1:nrow(): 1:0 would iterate over c(1,0)
+
+    for ( i in seq_len( nrow(outFormat) ) )
+    {
+        cat( as.character(outFormat[i,]), file=filename, sep="\t", append=TRUE )
+        cat( "\n", file=filename, append=TRUE )
+    }
+
+    message( "Info: peak file was exported in GFF format:" )
+    message( "Info: file name = ", filename )
+}
+
+# BED format (score = peak count)
+
+# Write the peaks to 'filename' in BED layout, preceded by a track line.
+# Columns read from peakList: chrID, peakStart, peakStop, aveChipCount.
+# An empty peakList produces a file that contains the track line only.
+
+.exportBED <- function( peakList, filename )
+{
+    # BED: (required) chrom, chromStart, chromEnd
+    # BED: (optional) name, score, strand, thickStart, thickEnd, itemRgb,
+    #                   blockCount, blockSizes, blockStarts
+
+    # - the constant name column is replicated explicitly so that
+    #   data.frame() also accepts an empty peak list
+    # - chrID is converted to character so that a factor is written with its
+    #   labels instead of its integer codes
+
+    nPeak <- length( peakList$chrID )
+    outFormat <- data.frame( as.character(peakList$chrID),
+        peakList$peakStart, peakList$peakStop, rep( "MOSAiCS_peak", nPeak ),
+        peakList$aveChipCount, stringsAsFactors=FALSE )
+
+    line0 <- 'track name=mosaicsPeaks description=\"MOSAiCS peaks\" useScore=1'
+    cat( as.character(line0), "\n", file=filename )
+
+    # seq_len() rather than 1:nrow(): 1:0 would iterate over c(1,0)
+
+    for ( i in seq_len( nrow(outFormat) ) )
+    {
+        cat( as.character(outFormat[i,]), file=filename, sep="\t", append=TRUE )
+        cat( "\n", file=filename, append=TRUE )
+    }
+
+    message( "Info: peak file was exported in BED format:" )
+    message( "Info: file name = ", filename )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_getParamZ0.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,100 @@
+
+# Estimate the negative binomial background parameters (a, b) of one stratum.
+#
+# Arguments:
+#   Y        counts of the stratum
+#   bgEst    "matchLow" (match the frequencies of counts 0, 1, 2) or "rMOM"
+#            (method of moments on median and MAD). Any other value,
+#            including the default NA, selects the method from Y_freq:
+#            "matchLow" if counts <= 2 make up more than 50% of Y_freq,
+#            "rMOM" otherwise.
+#   Y_freq   frequency table of counts, named by count value; only read when
+#            bgEst is neither "matchLow" nor "rMOM"
+#
+# "matchLow" is only considered when the stratum contains all of the counts
+# 0, 1 and 2; otherwise "rMOM" is used.
+#
+# Returns the numeric vector
+#   c( a, b, mean0, var0, u0, u1, u2, length(Y), ty )
+# where u0/u1/u2 are the numbers of counts equal to 0/1/2 and ty encodes the
+# estimator that produced a and b:
+#   1 = matching of 0, 1, 2 counts ("Q"), 2 = method of moments ("MM"),
+#   3 = no proper estimate (a and b are NA).
+
+.getParamZ0 <- function( Y, bgEst=NA, Y_freq ) {
+
+    u0 <- length(which(Y==0))
+    u1 <- length(which(Y==1))
+    u2 <- length(which(Y==2))
+
+    # determine background estimation method
+
+    if ( u0>0 && u1>0 && u2>0 ) {
+        # if we have all of 0, 1, 2, then consider "bgEst"
+        # (the !is.na() guards keep the NA default from breaking if())
+
+        if ( !is.na(bgEst) && bgEst=="matchLow" ) {
+            estMethod <- "matchLow"
+        } else if ( !is.na(bgEst) && bgEst=="rMOM" ) {
+            estMethod <- "rMOM"
+        } else {
+            # "bgEst" is not specified: previously estMethod stayed
+            # undefined here and the function failed further down.
+            # if bins w/ 0, 1, 2 are more than 50%, then use "matchLow";
+            # otherwise, use "rMOM"
+
+            lowShare <- sum( Y_freq[ as.numeric(names(Y_freq))<=2 ] ) / sum(Y_freq)
+            if ( lowShare > 0.5 ) {
+                estMethod <- "matchLow"
+            } else {
+                estMethod <- "rMOM"
+            }
+        }
+    } else {
+        # if any of 0, 1, 2 is missing, then use only "rMOM"
+
+        estMethod <- "rMOM"
+    }
+
+    # estimate parameters
+
+    if ( estMethod == "matchLow" ) {
+
+        # matching 0, 1, 2 counts
+
+        r1 <- u1/u0
+        r2 <- u2/u1
+        a <- r1/(2*r2-r1)
+        b <- 1/(2*r2-r1)-1
+        ty <- 1     # 'Q'
+
+        mean0 <- mean(Y)
+        var0 <- var(Y)
+
+        if( a<=0 || b<=0 || a==Inf || b==Inf ) {
+            # MOM, if a & b estimates based on 0, 1, 2 counts are improper
+            # - use all observations
+
+            if( !is.na(var0) && !is.na(mean0) && var0 > mean0 ) {
+                # use MOM only when we have proper mean and variance
+
+                a <- mean0^2/(var0-mean0)
+                b <- mean0/(var0-mean0)
+                ty <- 2     # 'MM'
+            } else {
+                a <- NA
+                b <- NA
+                ty <- 3     # NA
+            }
+        }
+    } else if ( estMethod == "rMOM" ) {
+        # MOM (mosaics, ver 1.0.2): using robust statistics
+        # (mosaics, ver 1.0.6): median and MAD for estimates of mean and variance
+
+        mean0 <- median(Y)
+        var0 <- ( mad(Y) )^2
+
+        if( !is.na(var0) && !is.na(mean0) &&  var0 > mean0 ) {
+            # use MOM only when we have proper mean and variance
+
+            a <- mean0^2 / (var0-mean0)
+            b <- mean0 / (var0-mean0)
+            ty <- 2     # 'MM'
+        } else {
+            a <- NA
+            b <- NA
+            ty <- 3     # NA
+        }
+    }
+
+    return( c( a, b, mean0, var0, u0, u1, u2, length(Y), ty ) )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_getPi0.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,175 @@
+
+# Estimate pi0 as the ratio of observed bin counts to the counts expected
+# under the negative binomial background model (size a, rate b = a/mu).
+# The result is capped at 0.99.
+#
+# Arguments:
+#   mu       vector of background means (one rate b = a/mu per element)
+#   a        size parameter handed to dnbinom()
+#   Y_freq   frequency table of tag counts, named by count value
+#
+# Two strategies are used:
+#   - if counts <= 2 make up more than 50% of Y_freq, the ratio is taken over
+#     the counts 0, 1 and 2;
+#   - otherwise the ratio is taken over a window around the mode of the
+#     counts, which is widened one count at a time (see below).
+
+.getPi0 <- function( mu, a, Y_freq )
+{
+    # generate simulated tag counts
+
+    b <- a/mu
+    N <- length(b)
+
+    # calculate pi0
+
+    if ( sum(Y_freq[ as.numeric(names(Y_freq))<=2 ])/sum(Y_freq) > 0.5 ) {
+        # if 0, 1, 2 counts are more than 50%, use 0, 1, 2 counts
+
+        #print("pi0 estimation based on 0,1,2 counts")
+
+        py0z0 <- dnbinom(0,a,b/(b+1))
+        py1z0 <- dnbinom(1,a,b/(b+1))
+        py2z0 <- dnbinom(2,a,b/(b+1))
+
+        # expected number of bins with count 0, 1 or 2 under the background
+        denom_pi0 <- sum(py0z0) + sum(py1z0) + sum(py2z0)
+        # NOTE(review): takes the first three entries of Y_freq, i.e. assumes
+        # that they are the counts 0, 1 and 2 -- confirm against callers
+        num_pi0 <- sum( Y_freq[1:3] )
+
+        pi0 <- min( num_pi0/denom_pi0, 0.99 )
+    } else {
+        # otherwise, estimate pi0 by searching from mode of Y
+        # until pi0 value converges or the window exceeds 2 * IQR of Y
+
+        # For robust estimation of modes, density estimation of ChIP tag counts is used
+
+        #print("pi0 estimation based on Ymode")
+
+        # Ymode = the one among the 10 most frequent counts that is closest
+        # to the maximum of the kernel density estimate
+
+        #Ymode <- as.numeric( names(Y_freq)[ which.max(Y_freq) ] )
+        density_chip <- density( rep( as.numeric(names(Y_freq)), Y_freq ) )
+        mode_smooth <- density_chip$x[ which.max(density_chip$y) ][1]
+        mode_cand <- ( as.numeric(names(Y_freq))[ order( Y_freq, decreasing=TRUE ) ] )[1:10]
+        Ymode <- mode_cand[ which.min( abs( mode_cand - mode_smooth ) ) ][1]
+
+        # a mode below 1 is replaced by the second count value of Y_freq
+        if ( Ymode < 1 ) { Ymode <- as.numeric(names(Y_freq))[2] }
+
+        #print( paste("mode of Y:",Ymode) )
+        #print( paste("ratio Y<=Ymode:",
+        #   length(which( rep( as.numeric(names(Y_freq)), Y_freq ) <= Ymode )) / sum(Y_freq) ))
+        #print( paste("ratio Y>Ymode:",
+        #   length(which( rep( as.numeric(names(Y_freq)), Y_freq ) > Ymode )) / sum(Y_freq) ))
+
+        # estimation of pi0
+        # pi0est[1] belongs to width -1, pi0est[2] to width 0, and so on
+
+        pi0est <- 1 # corresponding to width = -1
+        cumFreq <- 0
+
+        denom_pi0 <- sum( dnbinom( Ymode, a, b/(b+1) ) )
+        num_pi0 <- Y_freq[ names(Y_freq)==Ymode ]
+        cumFreq <- Y_freq[ names(Y_freq)==Ymode ]
+                # corresponding to width = 0
+
+        pi0est <- c( pi0est, num_pi0/denom_pi0 )
+
+        #for ( i in 1:width ) {
+        #   denom_pi0 <- denom_pi0 + sum( dnbinom( (Ymode-i), a, b/(b+1) ) )
+        #   denom_pi0 <- denom_pi0 + sum( dnbinom( (Ymode+i), a, b/(b+1) ) )
+        #
+        #   num_pi0 <- num_pi0 + Y_freq[ names(Y_freq)==(Ymode-i) ]
+        #   num_pi0 <- num_pi0 + Y_freq[ names(Y_freq)==(Ymode+i) ]
+        #
+        #   pi0est <- c( pi0est, num_pi0/denom_pi0 )
+        #   cumFreq <- cumFreq + num_pi0
+        #
+        #   print(i)
+        #   print(cumFreq/sum(Y_freq))
+        #}
+
+        # quartiles of the tag counts: IQR bounds the search, Q3 is used
+        # below as lower bound of the trusted widths
+
+        Y_Qall <- quantile( rep( as.numeric(names(Y_freq)), Y_freq ) )
+        Y_IQR <- Y_Qall[ names(Y_Qall)=="75%" ] - Y_Qall[ names(Y_Qall)=="25%" ]
+        Y_Q3 <- Y_Qall[ names(Y_Qall)=="75%" ]
+
+        width <- 0
+        #while ( abs(pi0est[width+2]-pi0est[width+1]) > 1e-6 & cumFreq/sum(Y_freq) <= 0.90 ) {
+        #while ( abs(pi0est[width+2]-pi0est[width+1]) > 1e-6 ) {
+        #while ( cumFreq/sum(Y_freq) <= 0.99 ) {
+        while ( width <= 2*Y_IQR ) {
+            # update pi0 value by extending width
+            # -> to avoid the case that pi0 estimate is based on too many bins
+
+            width <- width + 1
+
+            if ( Ymode - width > 0 ) {
+                # do not use bins with tag count 0
+
+                denom_pi0 <- denom_pi0 + sum( dnbinom( (Ymode-width), a, b/(b+1) ) )
+            }
+            denom_pi0 <- denom_pi0 + sum( dnbinom( (Ymode+width), a, b/(b+1) ) )
+
+            # observed frequencies are only added for counts present in Y_freq
+
+            if ( length(which(names(Y_freq)==(Ymode-width))) > 0 & Ymode - width > 0 ) {
+                # do not use bins with tag count 0
+
+                num_pi0 <- num_pi0 + Y_freq[ names(Y_freq)==(Ymode-width) ]
+                cumFreq <- cumFreq + Y_freq[ names(Y_freq)==(Ymode-width) ]
+            }
+            if ( length(which(names(Y_freq)==(Ymode+width))) > 0 ) {
+                num_pi0 <- num_pi0 + Y_freq[ names(Y_freq)==(Ymode+width) ]
+                cumFreq <- cumFreq + Y_freq[ names(Y_freq)==(Ymode+width) ]
+            }
+
+            pi0est <- c( pi0est, num_pi0/denom_pi0 )
+
+            #print(i)
+            #print(num_pi0/denom_pi0)
+            #print(cumFreq/sum(Y_freq))
+
+            # compares the estimate for this width with the one for width-1
+
+            if ( abs(pi0est[width+2]-pi0est[width+1]) <= 1e-6 ) {
+                # stop search if pi0 value converges
+
+                #print( "pi0 value converges" )
+
+                pi0 <- min( num_pi0/denom_pi0, 0.99 )
+                width_final <- width
+                break
+            }
+        }
+        #if ( cumFreq/sum(Y_freq) > 0.99 ) {
+        # NOTE(review): this branch is also entered when convergence was
+        # reached on an iteration where width already exceeds 2*Y_IQR; the
+        # converged pi0 is then overwritten -- confirm that this is intended
+        if ( width > 2*Y_IQR ) {
+            # if pi0 value does not converge, then choose minimum pi0 value
+            # (for robust estimation of pi0, LOESS is applied)
+
+            #print( "minimum pi0 value is used" )
+
+            # LOESS smoothing of pi0 estimates
+
+            pi0_smooth <- loess.smooth( c((-1),0:width), pi0est )
+
+            # restrict lower bound of width: Q3 - mode
+            # -> not trust pi0 estimate based on narrow width
+
+            pi0_smooth$y <- pi0_smooth$y[ pi0_smooth$x > (Y_Q3 - Ymode) ]
+            pi0_smooth$x <- pi0_smooth$x[ pi0_smooth$x > (Y_Q3 - Ymode) ]
+
+            # determine minimum pi0 among candidate values:
+            # take the raw estimate whose width is closest to the width at
+            # which the smoothed curve is minimal
+
+            width_final <- pi0_smooth$x[ which.min(pi0_smooth$y) ]
+            pi0 <- pi0est[ which.min( abs( c((-1),0:width) - width_final ) ) ]
+            pi0 <- min( pi0, 0.99 )
+            #print(which.min(pi0_smooth$y))
+            #print(pi0est)
+        }
+        #print( paste("pi0:", pi0) )
+        #print( paste("width:", width_final) )
+
+        #par( mfrow=c(2,1) )
+
+        #hist( rep( as.numeric(names(Y_freq)), Y_freq ), nclass=10000, xlim=c(0,500) )
+        #abline( v=Ymode, col="red" )
+        #abline( v=Ymode-width_final, col="red", lty=2 )
+        #abline( v=Ymode+width_final, col="red", lty=2 )
+        #abline( v=Ymode+(Y_Q3-Ymode), col="blue" )
+        #abline( v=Ymode+2*Y_IQR, col="blue" )
+
+        #plot( c((-1),0:width), pi0est, type="l", xlab="Width", ylab="pi0",
+        #   main=paste("pi0:",pi0) )
+        #lines( loess.smooth( c((-1),0:width), pi0est ), col="blue" )
+        #abline( v=width_final, col="red" )
+        #abline( v=(Y_Q3-Ymode), col="blue" )
+        #abline( v=2*Y_IQR, col="blue" )
+        #abline( h=pi0, col="red" )
+
+        #par( mfrow=c(1,1) )
+    }
+
+    # fallback based on simulated background counts
+    # NOTE(review): every assignment of pi0 above is capped at 0.99, so this
+    # branch looks unreachable -- confirm before relying on it
+
+    #pi0 <- min( num_pi0/denom_pi0, 0.99 )
+    if( pi0 > 0.99 ) {
+        Y_sim <- rnbinom(N,a,b/(b+1))
+        pi0 <- min( num_pi0/length(which(Y_sim<=2)), 0.99 )
+    }
+
+    return( pi0 )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_margDist_1S.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,60 @@
+#########################################################
+# Compute marginal density for Y=N+S (one signal component)
+#########################################################
+
+# Arguments:
+#   mosaicsEst   S4 object; slots a, muEst, b and c are read
+#   tagCount     vector of counts
+#   pNfit        list whose pN element is handed to conv_1S()
+#   k            counts are shifted by k before the convolution
+#
+# Returns a data frame with one row per element of tagCount:
+#   MDZ0   negative binomial density of the count (size a, rate a/muEst)
+#   MDZ1   value returned by conv_1S() for counts >= k, 0 otherwise
+
+.margDist_1S <- function( mosaicsEst, tagCount, pNfit, k=3 )
+{
+    # background parameters
+
+    a <- mosaicsEst@a
+    mu_est <- mosaicsEst@muEst
+    b_est <- a / mu_est
+
+    # signal parameters
+
+    sigSize <- mosaicsEst@b
+    sigRate <- mosaicsEst@c
+
+
+    # shift counts by k; everything below k is flagged with -1
+
+    Y <- tagCount - k
+    belowK <- which( Y<0 )
+    Y[belowK] <- -1
+
+
+    # mu rounded to 2 decimals; flagged positions get 0
+
+    mu_round <- round( mu_est, 2 )
+    if ( length(belowK) > 0 ) mu_round[belowK] <- 0
+    mu_round_U <- unique( mu_round )
+
+
+    # density under the background (tagCount & b_est have same length)
+
+    MDZ0 <- dnbinom( tagCount, size=a, b_est/(b_est+1) )
+
+
+    # density of the signal on 0, ..., max(Y)
+
+    pS <- dnbinom( 0:max(Y), sigSize, sigRate/(sigRate+1) )
+
+
+    # convolution, evaluated only where Y >= k
+
+    atLeastK <- which( Y>=0 )
+    MDZ1 <- rep( 0, length(Y) )
+    MDZ1[ atLeastK ] <- conv_1S( y=Y[atLeastK], mu_round=mu_round[atLeastK],
+        mu_round_U=mu_round_U, pN=pNfit$pN, pS=pS )
+
+
+    # return object
+
+    MD <- data.frame( cbind( MDZ0, MDZ1 ) )
+    colnames(MD) <- c('MDZ0','MDZ1')
+    return(MD)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_margDist_2S.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,66 @@
+#########################################################
+# Compute marginal density for Y=N+S1+S2
+#########################################################
+
+# Arguments:
+#   mosaicsEst   S4 object; slots a, muEst, b1, c1, b2 and c2 are read
+#   tagCount     vector of counts
+#   pNfit        list whose pN element is handed to conv_2S()
+#   k            counts are shifted by k before the convolution
+#
+# Returns a data frame with one row per element of tagCount:
+#   MDZ0         negative binomial density of the count (size a, rate a/muEst)
+#   MDZ1, MDZ2   the two values returned by conv_2S() for counts >= k,
+#                0 otherwise
+
+.margDist_2S <- function( mosaicsEst, tagCount, pNfit, k=3 )
+{
+    # background parameters
+
+    a <- mosaicsEst@a
+    mu_est <- mosaicsEst@muEst
+    b_est <- a / mu_est
+
+    # parameters of the two signal components
+
+    size1 <- mosaicsEst@b1
+    rate1 <- mosaicsEst@c1
+    size2 <- mosaicsEst@b2
+    rate2 <- mosaicsEst@c2
+
+
+    # shift counts by k; everything below k is flagged with -1
+
+    Y <- tagCount - k
+    belowK <- which( Y<0 )
+    Y[belowK] <- -1
+
+
+    # mu rounded to the nearest hundredth; flagged positions get 0
+
+    mu_round <- round( mu_est, 2 )
+    if ( length(belowK) > 0 ) mu_round[belowK] <- 0
+    mu_round_U <- unique( mu_round )
+
+
+    # density under the background (tagCount & b_est have same length)
+
+    MDZ0 <- dnbinom( tagCount, size=a, b_est/(b_est+1) )
+
+
+    # densities of both signal components on 0, ..., max(Y)
+
+    support <- 0:max(Y)
+    pS1 <- dnbinom( support, size1, rate1/(rate1+1) )
+    pS2 <- dnbinom( support, size2, rate2/(rate2+1) )
+
+
+    # convolution, evaluated only where Y >= k; the result is read as
+    # consecutive pairs (one pair per evaluated position)
+
+    atLeastK <- which( Y>=0 )
+    MDGfit <- conv_2S( y=Y[atLeastK], mu_round=mu_round[atLeastK],
+        mu_round_U=mu_round_U, pN=pNfit$pN, pS1=pS1, pS2=pS2 )
+
+    MDG <- matrix( 0, length(Y), 2 )
+    MDG[ atLeastK, ] <- matrix( MDGfit, ncol=2, byrow=TRUE )
+    MDZ1 <- MDG[,1]
+    MDZ2 <- MDG[,2]
+
+
+    # return object
+
+    MD <- data.frame( cbind( MDZ0, MDZ1, MDZ2 ) )
+    colnames(MD) <- c('MDZ0','MDZ1','MDZ2')
+    return(MD)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_mosaicsZ0.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,87 @@
+
+#.mosaicsZ0 <- function( Y, analysisType=NA, X=NA, M=NA, GC=NA, truncProb=0.9999, Y_freq )
+# Stratify Y and estimate the background parameters of every stratum with
+# .getParamZ0().
+#
+# Arguments:
+#   Y             vector of counts
+#   bgEst         passed on to .getParamZ0()
+#   analysisType  defines the strata:
+#                   "OS"  unique (M, GC) pairs
+#                   "TS"  unique (X, M, GC) triples
+#                   "IO"  unique values of X, after X has been truncated at
+#                         inputTrunc
+#   X, M, GC      stratification variables, as required by analysisType
+#   inputTrunc    upper bound applied to X when analysisType is "IO"
+#   Y_freq        passed on to .getParamZ0()
+#   parallel      if TRUE, strata are processed with mclapply()
+#   nCore         number of cores handed to mclapply(). The defaults of
+#                 parallel and nCore used to be self-referential
+#                 (parallel=parallel, nCore=nCore), which fails whenever the
+#                 caller omits them.
+#
+# Returns a list with the per-stratum results (a_u, b_u, mean0_u, var0_u,
+# u0_u, u1_u, u2_u, n_u, ty_u), the stratum coordinates (M_u and GC_u for
+# "OS"; X_u, M_u and GC_u for "TS"; X_u for "IO"), the distinct values and
+# frequency table of Y (Y_val, Y_freq) and the counts of every stratum
+# (Y_sub_list).
+
+.mosaicsZ0 <- function( Y, bgEst="automatic", analysisType=NA,
+    X=NA, M=NA, GC=NA, inputTrunc=NA, Y_freq,
+    parallel=FALSE, nCore=8 )
+{
+    #library(MASS)
+
+    # fail early with a clear message; an NA or unknown analysis type would
+    # otherwise break in if() or leave S1 undefined
+
+    if ( !( analysisType %in% c( "OS", "TS", "IO" ) ) ) {
+        stop( "'analysisType' should be one of \"OS\", \"TS\", or \"IO\"" )
+    }
+
+    # truncate X for input only analysis
+
+    if ( analysisType=="IO" ) {
+        #trunc <- quantile( X, prob=truncProb )
+        #X[which(X>trunc)] <- trunc
+        X[which(X>inputTrunc)] <- inputTrunc
+    }
+
+    # construct unique M/GC pair
+
+    switch( analysisType,
+        OS = { S1 <- paste( M, GC, sep = ':' ) },
+        TS = { S1 <- paste( X, M, GC, sep = ':' ) },
+        IO = { S1 <- X }
+    )
+    #tmp <- cbind( Y, S1 )
+    #tempdf <- within( as.data.frame(tmp), {S1 <- factor(S1)} )
+    tmp <- data.frame( Y, as.factor(S1) )
+    ySubList <- with( tmp, split(Y, S1) )
+    #ySubList <- split( Y, S1 )
+
+    # construct strata index
+    # (for OS / TS the stratum label "x:m:gc" is split back into numbers,
+    # one row per stratum)
+
+    if ( analysisType=="IO" ) {
+        indStrata <- as.numeric(as.vector(names(ySubList)))
+    } else {
+        indStrata <- apply( as.matrix(names(ySubList)), 1,
+            function(x){as.numeric(strsplit(x, ":")[[1]])} )
+        indStrata <- t(indStrata)
+    }
+
+    # calculate strata-specific parameters
+
+    if ( parallel ) {
+        yParamList <- mclapply( ySubList,
+            function(x) .getParamZ0( x, bgEst=bgEst, Y_freq=Y_freq ),
+            mc.cores=nCore )
+    } else {
+        yParamList <- lapply( ySubList,
+            function(x) .getParamZ0( x, bgEst=bgEst, Y_freq=Y_freq ) )
+    }
+    # .getParamZ0() returns 9 values per stratum -> one row per stratum
+    yParamMat <- matrix( unlist(yParamList), ncol = 9, byrow = TRUE )
+
+    # decode ty (1 -> "Q", 2 -> "MM", 3 -> NA)
+
+    ty <- rep( "", nrow(yParamMat) )
+    ty[ yParamMat[, 9]==1 ] <- "Q"
+    ty[ yParamMat[, 9]==2 ] <- "MM"
+    ty[ yParamMat[, 9]==3 ] <- NA
+
+    # construct common summary (Y is tabulated once and reused)
+
+    tabY <- table(Y)
+    parEstZ0 <- list(
+        a_u = yParamMat[, 1], b_u = yParamMat[, 2],
+        mean0_u = yParamMat[, 3], var0_u = yParamMat[, 4],
+        u0_u = yParamMat[, 5], u1_u = yParamMat[, 6], u2_u = yParamMat[, 7],
+        n_u = yParamMat[, 8], ty_u = ty,
+        Y_val = as.numeric(names(tabY)), Y_freq = tabY, Y_sub_list=ySubList )
+
+    # construct analysis type specific summary
+
+    switch( analysisType,
+        OS = {
+            parEstZ0$M_u <- indStrata[,1,drop=FALSE]
+            parEstZ0$GC_u <- indStrata[,2,drop=FALSE]
+        },
+        TS = {
+            parEstZ0$X_u <- indStrata[,1,drop=FALSE]
+            parEstZ0$M_u <- indStrata[,2,drop=FALSE]
+            parEstZ0$GC_u <- indStrata[,3,drop=FALSE]
+        },
+        IO = {
+            parEstZ0$X_u <- indStrata
+            #parEstZ0$trunc <- trunc
+        }
+    )
+
+    return(parEstZ0)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_mosaicsZ1_1S.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,196 @@
+
+#########################################################
+# Z1 with 1 signal component
+#########################################################
+
+.mosaicsZ1_1S <- function( MOSAiCS_Z0, Y, pNfit, Y_bd_all, k=3 )
+{
+    # Fit the signal part of the one-signal-component model with a short EM run.
+    #
+    # Arguments (as used in this function):
+    #   MOSAiCS_Z0  background fit; its elements a, muEst and pi0 are read.
+    #   Y           tag counts; only entries with Y >= k enter the EM, shifted by k.
+    #   pNfit       list; PYZ0 is read here and the whole list is handed to
+    #               .margDistZ1_1S().
+    #               NOTE(review): PYZ0 is presumably P(Y|Z=0) for the Y >= k bins,
+    #               in the same order -- confirm with the caller.
+    #   Y_bd_all    counts used for the moment-based starting values.
+    #   k           offset subtracted from Y (default 3).
+    #
+    # Returns list( pi0, a, muEst, b, c ). b and c are the last accepted
+    # estimates; if no update is accepted they are the starting values.
+
+    ##################################################################
+    ### initialization of the main EM
+    ##################################################################
+
+    a <- MOSAiCS_Z0$a
+    mu_est <- MOSAiCS_Z0$muEst
+    pi0 <- MOSAiCS_Z0$pi0
+
+    # initial mu & var
+
+    mu1_init <- mean(Y_bd_all)
+    var1_init <- var(Y_bd_all)
+
+    # initial EN and varN, lowered heuristically so that the starting
+    # signal mean and variance stay positive
+
+    EN <- mean(mu_est)
+    varN <- EN*( 1 + EN/a )
+    if ( mu1_init < EN ) { EN <- mu1_init - 0.1 }
+    if ( var1_init < varN ) { varN <- var1_init - 0.15 }
+
+    # initialize b & c (denominator replaced by 0.1 when it is not positive)
+
+    denom_init <- var1_init - varN - mu1_init + EN
+    if ( denom_init <= 0 ) { denom_init <- 0.1 }
+    b_init <- (mu1_init - EN)^2 / denom_init
+    c_init <- (mu1_init - EN) / denom_init
+
+    ##################################################################
+    ### The main EM calculation: iterate till convergence
+    ##################################################################
+
+    # calculate EN and varN
+    # (redefine EN after heuristic EN adjustment for initialization)
+
+    EN <- mean(mu_est)
+    varN <- EN*(1 + EN/a)
+
+    # use only Y >= k, shifted by k
+
+    Yk <- Y[ which(Y>=k) ] - k
+    Ykmax <- max(Yk)
+
+    # initialization of the main EM calculation
+
+    PYZ0 <- pNfit$PYZ0
+    PYZ1 <- .margDistZ1_1S( Yk, Ykmax, pNfit, b_init, c_init )
+
+    b_cur <- b_init
+    c_cur <- c_init
+
+    eps <- 1e-6
+    maxUpdate <- 8      # same cap as the former "iter" counter running from 2 to 9
+    nUpdate <- 0
+    logLik_old <- -Inf
+    logLik_cur <- sum( log( pi0*PYZ0 + (1-pi0)*PYZ1 ) )
+
+    # main EM iteration
+    # (!isTRUE(...) keeps the condition a plain TRUE/FALSE even if a
+    #  log-likelihood is NA/NaN; such cases end in the break below)
+
+    while ( nUpdate < maxUpdate && !isTRUE( abs(logLik_cur - logLik_old) <= eps ) )
+    {
+        # E-step
+
+        Z_latent <- (1 - pi0)*PYZ1 / ( pi0*PYZ0 + (1 - pi0)*PYZ1 )
+
+        # M-step
+
+        mu1 <- sum(Z_latent*Yk) / sum(Z_latent)
+        var1 <- sum(Z_latent*(Yk - mu1)^2) / sum(Z_latent)
+
+        denom <- var1 - varN - mu1 + EN
+        b_new <- (mu1 - EN)^2 / denom
+        c_new <- (mu1 - EN) / denom
+
+        # stop iteration and keep the previous estimates if assumptions are
+        # not satisfied; NA/NaN/Inf estimates are treated the same way
+        # instead of aborting with an error in if()
+
+        if ( !is.finite(b_new) || !is.finite(c_new) || b_new < 0 || c_new < 0 )
+        {
+            break
+        }
+
+        # calculate P(Y|Z=1) and the log-likelihood for the candidate update
+
+        PYZ1_new <- .margDistZ1_1S( Yk, Ykmax, pNfit, b_new, c_new )
+        logLik_new <- sum( log( pi0*PYZ0 + (1-pi0)*PYZ1_new ) )
+
+        # reject an update whose log-likelihood is NA/NaN/-Inf
+        # (a -Inf value would make the next E-step produce NaN weights)
+
+        if ( !is.finite(logLik_new) ) {
+            break
+        }
+
+        # accept the update
+
+        PYZ1 <- PYZ1_new
+        b_cur <- b_new
+        c_cur <- c_new
+        logLik_old <- logLik_cur
+        logLik_cur <- logLik_new
+        nUpdate <- nUpdate + 1
+    }
+
+    return( list(
+        pi0 = MOSAiCS_Z0$pi0, a = MOSAiCS_Z0$a, muEst = MOSAiCS_Z0$muEst,
+        b = b_cur, c = c_cur ) )
+}
+
+#.margDistZ1_1S <- function( Yori, pNfit, b, c )
+.margDistZ1_1S <- function( Y, Ymax, pNfit, b, c )
+{
+    # Marginal density of the shifted counts under the one-signal-component model.
+    #
+    # Arguments:
+    #   Y       shifted counts passed to conv_1S() as "y"
+    #           (callers in this file pass Y - k for the bins with Y >= k).
+    #   Ymax    largest shifted count; the signal pmf is tabulated on 0:Ymax.
+    #   pNfit   list; its elements pN, mu_round and mu_round_U are forwarded
+    #           unchanged to conv_1S().
+    #   b, c    negative binomial parameters of the signal:
+    #           size = b, prob = c/(c+1).
+    #
+    # Returns whatever conv_1S() returns.
+    # NOTE(review): conv_1S() is defined outside this file section; presumably
+    # it convolves the background pmf pN with pS -- confirm.
+
+    # prob of S, tabulated on 0:Ymax
+
+    pS <- dnbinom( 0:Ymax, b, c/(c+1) )
+
+    MDZ1 <- conv_1S( y=Y, mu_round=pNfit$mu_round,
+        mu_round_U=pNfit$mu_round_U, pN=pNfit$pN, pS=pS )
+
+    return(MDZ1)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_mosaicsZ1_2S.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,458 @@
+
+#########################################################
+# Z1 with mixture of 2 signal components
+#########################################################
+
+.mosaicsZ1_2S <- function( MOSAiCS_Z0, Y, pNfit, Y_bd_all, k=3 )
+{
+    # Fit the signal part of the two-signal-component model with a short EM run.
+    #
+    # Arguments (as used in this function):
+    #   MOSAiCS_Z0  background fit; its elements a, muEst and pi0 are read.
+    #   Y           tag counts; only entries with Y >= k enter the EM, shifted by k.
+    #   pNfit       list; PYZ0 is read here and the whole list is handed to
+    #               .margDistZ1_2S().
+    #               NOTE(review): PYZ0 is presumably P(Y|Z=0) for the Y >= k bins,
+    #               in the same order -- confirm with the caller.
+    #   Y_bd_all    counts handed to .emZ1_2S() to obtain starting values.
+    #   k           offset subtracted from Y (default 3).
+    #
+    # Returns list( pi0, a, muEst, p1, b1, c1, b2, c2 ). The signal parameters
+    # are the last accepted estimates; if no update is accepted they are the
+    # starting values.
+
+    ##################################################################
+    ### initialization of the main EM
+    ##################################################################
+
+    a <- MOSAiCS_Z0$a
+    mu_est <- MOSAiCS_Z0$muEst
+    pi0 <- MOSAiCS_Z0$pi0
+
+    # (b, c) from a component mean/variance given background EN and varN;
+    # with floorDenom=TRUE a non-positive denominator is replaced by 0.1
+
+    momentBC <- function( mu, v, EN, varN, floorDenom ) {
+        denom <- v - varN - mu + EN
+        if ( floorDenom && denom <= 0 ) { denom <- 0.1 }
+        c( (mu - EN)^2 / denom, (mu - EN) / denom )
+    }
+
+    # initial EM calculation (two-component NB mixture on Y_bd_all)
+
+    par_2mixNB <- .emZ1_2S( Y_bd_all, epsilon=0.0001,
+        b1_old=NULL, c1_old=NULL, b2_old=NULL, c2_old=NULL )
+
+    mu1_init <- par_2mixNB$b1/par_2mixNB$c1
+    mu2_init <- par_2mixNB$b2/par_2mixNB$c2
+
+    var1_init <- mu1_init*(1+1/par_2mixNB$c1)
+    var2_init <- mu2_init*(1+1/par_2mixNB$c2)
+
+    # initial EN and varN, chosen below the smaller component mean/variance
+
+    mu_min <- min(mu1_init,mu2_init)
+    if ( mu_min > mean(mu_est) ) {
+        EN <- mean(mu_est)
+    } else if ( mu_min > median(mu_est) ) {
+        EN <- median(mu_est)
+    } else {
+        EN <- mu_min - 0.1
+    }
+
+    varN <- EN*(1 + EN/a)
+    var_min <- min(var1_init,var2_init)
+    if ( var_min < varN ) { varN <- var_min - 0.15 }
+
+    # initialize b1, c1, b2, and c2
+
+    bc1 <- momentBC( mu1_init, var1_init, EN, varN, floorDenom=TRUE )
+    bc2 <- momentBC( mu2_init, var2_init, EN, varN, floorDenom=TRUE )
+
+    # take care of identifiability problem:
+    # component 1 is the one with the smaller signal mean b/c
+
+    if ( bc1[1]/bc1[2] <= bc2[1]/bc2[2] ) {
+        p1 <- par_2mixNB$p1
+    } else {
+        p1 <- par_2mixNB$p2    # p2 = 1 - p1
+        bc_tmp <- bc1
+        bc1 <- bc2
+        bc2 <- bc_tmp
+    }
+
+    ##################################################################
+    ### The main EM calculation: iterate till convergence
+    ##################################################################
+
+    # calculate EN and varN
+    # (redefine EN after heuristic EN adjustment for initialization)
+
+    EN <- mean(mu_est)
+    varN <- EN*(1 + EN/a)
+
+    # use only Y >= k, shifted by k
+
+    Yk <- Y[ which(Y>=k) ] - k
+    Ykmax <- max(Yk)
+
+    # initialization of the main EM calculation
+
+    PYZ0 <- pNfit$PYZ0
+    PYZ1 <- .margDistZ1_2S( Yk, Ykmax, pNfit, bc1[1], bc1[2], bc2[1], bc2[2] )
+    PYZ1G1 <- PYZ1$MDG1
+    PYZ1G2 <- PYZ1$MDG2
+
+    eps <- 1e-6
+    maxUpdate <- 8      # same cap as the former "iter" counter running from 2 to 9
+    nUpdate <- 0
+    logLik_old <- -Inf
+    logLik_cur <- sum( log( pi0*PYZ0 + (1-pi0)*( p1*PYZ1G1 + (1-p1)*PYZ1G2 ) ) )
+
+    # main EM iteration
+    # (!isTRUE(...) keeps the condition a plain TRUE/FALSE even if a
+    #  log-likelihood is NA/NaN; such cases end in the break below)
+
+    while ( nUpdate < maxUpdate && !isTRUE( abs(logLik_cur - logLik_old) <= eps ) )
+    {
+        # E-step
+
+        denom_G_latent <- p1*PYZ1G1 + (1-p1)*PYZ1G2
+
+        G_latent <- p1*PYZ1G1 / denom_G_latent
+        Z_latent <- (1 - pi0)*denom_G_latent / ( pi0*PYZ0 + (1 - pi0)*denom_G_latent )
+
+        # M-step
+
+        w1 <- Z_latent*G_latent
+        w2 <- Z_latent*(1-G_latent)
+
+        p1_new <- sum(w1) / sum(Z_latent)
+
+        mu1 <- sum(w1*Yk) / sum(w1)
+        var1 <- sum(w1*(Yk - mu1)^2) / sum(w1)
+
+        mu2 <- sum(w2*Yk) / sum(w2)
+        var2 <- sum(w2*(Yk - mu2)^2) / sum(w2)
+
+        bc1_new <- momentBC( mu1, var1, EN, varN, floorDenom=FALSE )
+        bc2_new <- momentBC( mu2, var2, EN, varN, floorDenom=FALSE )
+
+        # stop iteration and keep the previous estimates if assumptions are
+        # not satisfied; NA/NaN/Inf estimates are treated the same way
+        # instead of aborting with an error in if()
+
+        est_new <- c( p1_new, bc1_new, bc2_new )
+        if ( any( !is.finite(est_new) ) || p1_new < 0.01 ||
+            any( c( bc1_new, bc2_new ) < 0 ) )
+        {
+            break
+        }
+
+        # calculate P(Y|Z=1,G=1) and P(Y|Z=1,G=2) for the candidate update
+
+        PYZ1_new <- .margDistZ1_2S( Yk, Ykmax, pNfit,
+            bc1_new[1], bc1_new[2], bc2_new[1], bc2_new[2] )
+        logLik_new <- sum( log( pi0*PYZ0 +
+            (1-pi0)*( p1_new*PYZ1_new$MDG1 + (1-p1_new)*PYZ1_new$MDG2 ) ) )
+
+        # reject an update whose log-likelihood is NA/NaN/-Inf
+        # (a -Inf value would make the next E-step produce NaN weights)
+
+        if ( !is.finite(logLik_new) ) {
+            break
+        }
+
+        # accept the update
+
+        PYZ1G1 <- PYZ1_new$MDG1
+        PYZ1G2 <- PYZ1_new$MDG2
+        p1 <- p1_new
+        bc1 <- bc1_new
+        bc2 <- bc2_new
+        logLik_old <- logLik_cur
+        logLik_cur <- logLik_new
+        nUpdate <- nUpdate + 1
+    }
+
+    return( list(
+        pi0 = MOSAiCS_Z0$pi0, a = MOSAiCS_Z0$a, muEst = MOSAiCS_Z0$muEst,
+        p1 = p1, b1 = bc1[1], c1 = bc1[2], b2 = bc2[1], c2 = bc2[2] ) )
+}
+
+
+# calculate P(Y|Z=1,G=1) and P(Y|Z=1,G=2) for the current parameters
+
+#.margDistZ1_2S <- function( Yori, pNfit, b1, c1, b2, c2 )
+.margDistZ1_2S <- function( Y, Ymax, pNfit, b1, c1, b2, c2 )
+{
+    # Marginal densities of the shifted counts for each of the two signal
+    # components, for the current parameters.
+    #
+    # Arguments:
+    #   Y        shifted counts passed to conv_2S() as "y"
+    #            (callers in this file pass Y - k for the bins with Y >= k).
+    #   Ymax     largest shifted count; the signal pmfs are tabulated on 0:Ymax.
+    #   pNfit    list; its elements pN, mu_round and mu_round_U are forwarded
+    #            unchanged to conv_2S().
+    #   b1, c1   negative binomial parameters of signal component 1
+    #            (size = b1, prob = c1/(c1+1)).
+    #   b2, c2   same for signal component 2.
+    #
+    # Returns list( MDG1, MDG2 ): the two columns obtained by filling the
+    # conv_2S() result row-wise into a two-column matrix, i.e. the result is
+    # read as (G1, G2) pairs, one pair per element of Y.
+    # NOTE(review): conv_2S() is defined outside this file section; presumably
+    # it convolves the background pmf pN with pS1 and pS2 -- confirm.
+
+    # prob of S1 & S2, tabulated on 0:Ymax
+
+    pS1 <- dnbinom( 0:Ymax, b1, c1/(c1+1) )
+    pS2 <- dnbinom( 0:Ymax, b2, c2/(c2+1) )
+
+    MDGfit <- conv_2S( y=Y, mu_round=pNfit$mu_round,
+        mu_round_U=pNfit$mu_round_U, pN=pNfit$pN, pS1=pS1, pS2=pS2 )
+    MDG <- matrix( MDGfit, ncol=2, byrow=TRUE )
+
+    return( list( MDG1 = MDG[,1], MDG2 = MDG[,2] ) )
+}
+
+
+#########################################################
+# EM algorithm to initialize MOSAiCS_Z1_2S
+#########################################################
+
+.calMDZ1_2S <- function(b1,c1,b2,c2,Y_freq,Y_val)
+{
+    # Negative binomial densities of the two mixture components at Y_val.
+    #
+    # Arguments:
+    #   b1, c1   parameters of component 1 (size = b1, prob = c1/(c1+1)).
+    #   b2, c2   parameters of component 2.
+    #   Y_freq   not used here; kept for a uniform call signature.
+    #   Y_val    values at which the densities are evaluated.
+    #
+    # Returns a length(Y_val) x 2 matrix, one column per component.
+    # Rows where both densities are exactly zero are overwritten with the
+    # row just before the first such row, so later log() calls stay finite.
+
+    # NB distribution
+
+    MD <- matrix(0,nrow=length(Y_val),ncol=2)
+    MD[,1] <- dnbinom(Y_val,size=b1,prob=c1/(c1+1))
+    MD[,2] <- dnbinom(Y_val,size=b2,prob=c2/(c2+1))
+
+    rowTotal <- rowSums(MD)
+    id <- which( rowTotal==0 )
+    if(length(id)>0){
+        replaceID <- min(id)-1
+        if ( replaceID < 1 ) {
+            # the very first row is zero, so there is no preceding row
+            # (indexing row 0 used to fail with "replacement has length zero");
+            # fall back to the first row with non-zero density, if any
+            okID <- which( rowTotal > 0 )
+            replaceID <- if ( length(okID) > 0 ) okID[1] else NA
+        }
+        if ( !is.na(replaceID) ) {
+            MD[id,1] <- MD[replaceID,1]
+            MD[id,2] <- MD[replaceID,2]
+        }
+    }
+    return(MD)
+}
+
+.eStepZ1_2S <- function(MD,pZ,Y_freq,Y_val)
+{
+    # E-step of the two-component mixture EM.
+    #
+    # Arguments:
+    #   MD       two-column matrix of component densities (one row per value).
+    #   pZ       two-column matrix of current membership probabilities; its
+    #            first column gives the mixing proportion, and the matrix is
+    #            overwritten with the updated probabilities.
+    #   Y_freq   frequency of each value, used as weights for the proportion.
+    #   Y_val    not used here; kept for a uniform call signature.
+    #
+    # Returns pZ with column 1 / column 2 replaced by the posterior
+    # probabilities of component 1 / component 2.
+
+    # mixing proportions implied by the current memberships
+    prop1 <- sum(pZ[,1]*Y_freq)/sum(Y_freq)
+    prop2 <- 1-prop1
+
+    # weighted component densities and their total
+    num1 <- MD[,1]*prop1
+    num2 <- MD[,2]*prop2
+    total <- num1+num2
+
+    pZ[,1] <- num1/total
+    pZ[,2] <- num2/total
+
+    return(pZ)
+}
+
+.mStepZ1_2S <- function(pZ,Y_freq,Y_val,b1_old=NULL,c1_old=NULL,b2_old=NULL,c2_old=NULL)
+{
+    # M-step of the two-component mixture EM (moment-based update).
+    #
+    # Arguments:
+    #   pZ              two-column matrix of membership probabilities.
+    #   Y_freq, Y_val   frequency table of the data: counts and their values.
+    #   b1_old, c1_old, b2_old, c2_old
+    #                   if supplied (non-zero length), that parameter is held
+    #                   fixed at the given value instead of being re-estimated.
+    #
+    # Returns list( b1, c1, b2, c2, p1, p2, MD, logLik ), where MD is the
+    # density matrix from .calMDZ1_2S() and logLik the mixture log-likelihood.
+    # Stops with an error if a weighted mean or variance is NA/NaN.
+
+    pZ1 <- pZ[,1]
+    pZ2 <- pZ[,2]
+    p1 <- sum(pZ1*Y_freq) / sum(Y_freq)
+    p2 <- 1-p1
+
+    # weighted mean and variance of each component
+
+    mu1 <- sum(pZ1*Y_val*Y_freq) / sum(pZ1*Y_freq)
+    var1 <- sum(pZ1*Y_freq*(Y_val-mu1)^2) / sum(pZ1*Y_freq)
+    mu2 <- sum(pZ2*Y_val*Y_freq) / sum(pZ2*Y_freq)
+    var2 <- sum(pZ2*Y_freq*(Y_val-mu2)^2) / sum(pZ2*Y_freq)
+
+    # is.na() is TRUE for NaN as well, so one test covers both
+
+    if ( is.na(mu1) || is.na(var1) ) {
+        # inappropriate values occur
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+
+    if ( is.na(mu2) || is.na(var2) ) {
+        # inappropriate values occur
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+
+    # the moment estimators divide by (var - mu): force var > mu.
+    # "<=" (not "<") so that var == mu cannot give a division by zero
+
+    if ( var1 <= mu1 ) { var1 <- mu1 + 0.1 }
+    if ( var2 <= mu2 ) { var2 <- mu2 + 0.1 }
+
+    if ( length(b1_old)==0 ) { b1 <- mu1^2/(var1-mu1) } else { b1 <- b1_old }
+    if ( length(c1_old)==0 ) { c1 <- mu1/(var1-mu1) } else { c1 <- c1_old }
+    if ( length(b2_old)==0 ) { b2 <- mu2^2/(var2-mu2) } else { b2 <- b2_old }
+    if ( length(c2_old)==0 ) { c2 <- mu2/(var2-mu2) } else { c2 <- c2_old }
+
+    MD <- .calMDZ1_2S(b1,c1,b2,c2,Y_freq,Y_val)
+    logLik <- sum( log(p1*MD[,1]+p2*MD[,2]) * Y_freq )
+
+    return( list( b1=b1, c1=c1, b2=b2, c2=c2, p1=p1, p2=p2, MD=MD, logLik=logLik ) )
+}
+
+
+.emZ1_2S <- function( Y, epsilon, b1_old=NULL, c1_old=NULL, b2_old=NULL, c2_old=NULL )
+{
+    # EM algorithm for a two-component negative binomial mixture on Y.
+    #
+    # Arguments:
+    #   Y         observed counts.
+    #   epsilon   convergence threshold on the absolute change of the
+    #             log-likelihood between iterations.
+    #   b1_old, c1_old, b2_old, c2_old
+    #             if supplied (non-zero length), that parameter is held fixed
+    #             at the given value, both initially and in every M-step.
+    #
+    # Returns the list produced by the last .mStepZ1_2S() call
+    # (b1, c1, b2, c2, p1, p2, MD, logLik), with logLik set to the last
+    # log-likelihood recorded by the loop.
+    # Stops with an error if the starting means/variances are NA/NaN.
+
+    Y_freq <- table(Y)
+    Y_val <- as.numeric(names(Y_freq))
+
+    # initialize b1, c1, b2, and c2:
+    # component 1 from the lower half, component 2 from the top 20%
+
+    lowerY <- Y[ which( Y <= quantile(Y,0.5) ) ]
+    mean1 <- mean(lowerY)
+    var1 <- var(lowerY)
+
+    upperY <- Y[ which( Y > quantile(Y,0.8) ) ]
+    mean2 <- mean(upperY)
+    var2 <- var(upperY)
+
+    # while the lower part has zero (or undefined) mean, widen it step by
+    # step; at the last step (0.9) component 2 is also re-based on the
+    # values above the 0.9 quantile
+
+    qGrid <- c( 0.6, 0.7, 0.8, 0.9 )
+    for ( qi in seq_along(qGrid) ) {
+        if ( !( is.na(mean1) || mean1 == 0 ) ) {
+            break
+        }
+
+        lowerY <- Y[ which( Y <= quantile(Y,qGrid[qi]) ) ]
+        mean1 <- mean(lowerY)
+        var1 <- var(lowerY)
+
+        if ( qi == length(qGrid) ) {
+            upperY <- Y[ which( Y > quantile(Y,qGrid[qi]) ) ]
+            mean2 <- mean(upperY)
+            var2 <- var(upperY)
+        }
+    }
+
+    # is.na() is TRUE for NaN as well, so one test covers both
+
+    if ( is.na(mean1) || is.na(var1) ) {
+        # inappropriate values occur
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+
+    if ( is.na(mean2) || is.na(var2) ) {
+        # inappropriate values occur
+        stop( "over-estimation of background detected. Please tune the parameters!" )
+    }
+
+    # the moment estimators divide by (var - mean): force var > mean.
+    # "<=" (not "<") so that var == mean cannot give a division by zero
+
+    if ( var1 <= mean1 ) { var1 <- mean1+0.1 }
+    if ( var2 <= mean2 ) { var2 <- mean2+0.1 }
+
+    if (length(b1_old)==0) { b1 <- mean1^2/(var1-mean1) } else { b1 <- b1_old }
+    if (length(c1_old)==0) { c1 <- mean1/(var1-mean1) } else { c1 <- c1_old }
+    if (length(b2_old)==0) { b2 <- mean2^2/(var2-mean2) } else { b2 <- b2_old }
+    if (length(c2_old)==0) { c2 <- mean2/(var2-mean2) } else { c2 <- c2_old }
+
+    # initialize EM: one E-step from equal memberships, then one M-step
+
+    MD_2mixNB <- .calMDZ1_2S(b1,c1,b2,c2,Y_freq,Y_val)
+    pZ <- .eStepZ1_2S( MD_2mixNB, matrix(0.5,nrow=length(Y_val),ncol=2), Y_freq, Y_val )
+    par_2mixNB <- .mStepZ1_2S(pZ,Y_freq,Y_val,b1_old,c1_old,b2_old,c2_old)
+
+    logLik <- par_2mixNB$logLik
+    absdif <- Inf
+    i <- 2
+
+    # EM iterations
+
+    while( absdif > epsilon && i<=10000 )
+    {
+        # E-step
+
+        pZ <- .eStepZ1_2S(par_2mixNB$MD,pZ,Y_freq,Y_val)
+
+        # M-step
+
+        par_2mixNB <- .mStepZ1_2S(pZ,Y_freq,Y_val,b1_old,c1_old,b2_old,c2_old)
+
+        # update log lik
+        # (at the iteration cap the recorded log-likelihood is kept and the
+        #  change is reported as zero, which ends the loop)
+
+        if ( i == 10000 ) {
+            absdif <- 0
+        } else {
+            absdif <- abs( par_2mixNB$logLik - logLik )
+            logLik <- par_2mixNB$logLik
+        }
+
+        i <- i+1
+    }
+    par_2mixNB$logLik <- logLik
+
+    return(par_2mixNB)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_peakCall.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,371 @@
+
+# calculate posterior probabilities (1 signal model)
+
+.getPH_1S <- function( margDensity, pi0 )
+{
+    # Posterior probabilities for the one-signal-component model.
+    #
+    # Arguments:
+    #   margDensity   list with MDZ0 and MDZ1, the marginal densities under
+    #                 background and signal.
+    #   pi0           mixing proportion of the background.
+    #
+    # Returns list( pH0, pH1 ). Entries whose posterior is NA/NaN (e.g. both
+    # densities are zero) are assigned to the background: pH0 = 1, pH1 = 0.
+
+    # take parameters
+
+    MDZ0 <- margDensity$MDZ0
+    MDZ1 <- margDensity$MDZ1
+
+    # calculate posterior probabilities
+
+    denom <- MDZ0*pi0 + MDZ1*(1-pi0)
+    pH0 <- MDZ0 * pi0 / denom
+    pH1 <- MDZ1 * (1-pi0) / denom
+
+    # record the undefined entries BEFORE overwriting pH0; testing
+    # is.na(pH0) again afterwards would find nothing and leave pH1 as NaN
+
+    naID <- is.na(pH0)
+    pH0[ naID ] <- 1
+    pH1[ naID ] <- 0
+
+    post_prob <- list( pH0=pH0, pH1=pH1 )
+
+    return( post_prob )
+}
+
+
+# calculate posterior probabilities (2 signal model)
+
+.getPH_2S <- function( margDensity, pi0, p1 )
+{
+    # Posterior probabilities for the two-signal-component model.
+    #
+    # Arguments:
+    #   margDensity   list with MDZ0, MDZ1 and MDZ2, the marginal densities
+    #                 under background and the two signal components.
+    #   pi0           mixing proportion of the background.
+    #   p1            proportion of signal component 1 within the signal.
+    #
+    # Returns list( pH0, pH1, pH2 ). Entries whose posterior is NA/NaN (e.g.
+    # all densities are zero) are assigned to the background:
+    # pH0 = 1, pH1 = 0, pH2 = 0.
+
+    # take parameters
+
+    MDZ0 <- margDensity$MDZ0
+    MDZ1 <- margDensity$MDZ1
+    MDZ2 <- margDensity$MDZ2
+
+    # calculate posterior probabilities
+
+    denom <- MDZ0*pi0 + (1-pi0) * ( MDZ1*p1 + MDZ2*(1-p1) )
+    pH0 <- MDZ0 * pi0 / denom
+    pH1 <- MDZ1 * (1-pi0) * p1 / denom
+    pH2 <- MDZ2 * (1-pi0) * (1-p1) / denom
+
+    # record the undefined entries BEFORE overwriting pH0; testing
+    # is.na(pH0) again afterwards would find nothing and leave pH1/pH2 as NaN
+
+    naID <- is.na(pH0)
+    pH0[ naID ] <- 1
+    pH1[ naID ] <- 0
+    pH2[ naID ] <- 0
+
+    post_prob <- list( pH0=pH0, pH1=pH1, pH2=pH2 )
+
+    return( post_prob )
+}
+
+
+# claim peaks
+
+.peakCall <- function( postProb, dataSet, FDR, binsize=NA,
+    maxgap=200, minsize=50, thres=10, analysisType )
+{
+    # Call peaks from bin-level posterior probabilities.
+    #
+    # Arguments:
+    #   postProb       list; pH0 (posterior probability of background, one
+    #                  value per bin) is used.
+    #   dataSet        list with per-bin vectors chrID, coord and Y, plus
+    #                  X for "TS"/"IO" and M, GC for "OS"/"TS".
+    #   FDR            false discovery rate level.
+    #   binsize        bin width; if NA, the smallest absolute difference
+    #                  between consecutive coord values is used.
+    #   maxgap         peaks no further apart than this are merged.
+    #   minsize        peaks of width <= minsize are discarded.
+    #   thres          a bin is a peak bin only if its count Y exceeds thres.
+    #   analysisType   "OS", "TS" or "IO"; selects the extra summary columns.
+    #
+    # Returns list( peakSet, bdBin, empFDR ):
+    #   peakSet   data frame with one row per peak (chrID, peakStart, peakStop,
+    #             peakSize, aveP, minP, aveChipCount, maxChipCount and the
+    #             analysis-type-specific columns), or NULL if no peak remains.
+    #   bdBin     data frame (chrID, peak) with a 0/1 peak-bin indicator.
+    #   empFDR    mean pH0 over the bins passing the FDR cutoff (before the
+    #             count threshold); 0 if no bin is called.
+    #
+    # IRanges(), reduce(), start(), end() and findOverlaps() are used below;
+    # they are not defined in this file (presumably from the IRanges package,
+    # cf. the former "library(IRanges)" note).
+
+    betapH <- postProb$pH0
+    chrID <- dataSet$chrID
+    Y <- dataSet$Y
+
+    # result returned whenever no bin ends up being called as a peak bin
+
+    noPeakResult <- function() {
+        bdBin <- data.frame( chrID, rep(0,length(Y)), stringsAsFactors=FALSE )
+        colnames(bdBin) <- c("chrID","peak")
+        list( peakSet=NULL, bdBin=bdBin, empFDR=0 )
+    }
+
+    # FDR = direct posterior probability approach (Newton et al., 2004, Biostatistics):
+    # running mean of the sorted pH0 = expected rate of false discoveries
+
+    betapH_s <- sort(betapH)
+    sbetapH <- cumsum(betapH_s) / seq_along(betapH_s)
+    id <- which( sbetapH <= FDR )
+
+    if ( length(id) == 0 ) {
+        # there is no peak
+        return( noPeakResult() )
+    }
+
+    # threshold peaks by min tag count & determine peak bins
+
+    cutoff <- betapH_s[max(id)]
+    sigID <- which( betapH<=cutoff )
+    peakBinID <- which( betapH<=cutoff & Y>thres )
+
+    if ( length(peakBinID) == 0 ) {
+        # we lose all peaks after tag count thresholding
+        return( noPeakResult() )
+    }
+
+    bd_bin <- rep( 0, length(Y) )
+    bd_bin[ peakBinID ] <- 1
+
+    # empirical FDR (before thresholding)
+
+    empFDR <- sum(betapH[sigID]) / length(sigID)
+
+    # analysis-type-specific covariates and summary columns
+
+    hasInput <- analysisType %in% c("TS","IO")
+    hasMGC <- analysisType %in% c("OS","TS")
+    if ( !hasInput && !hasMGC ) {
+        stop( "unsupported analysisType: ", analysisType )
+    }
+
+    infoNames <- c( 'aveP','minP','aveChipCount','maxChipCount' )
+    if ( hasInput ) {
+        X <- dataSet$X
+        nRatio <- sum(Y) / sum(X)
+            # genome-wide sequencing depth adjustment
+        X_list <- split( X, chrID )
+        infoNames <- c( infoNames,
+            'aveInputCount','aveInputCountScaled','aveLog2Ratio' )
+    }
+    if ( hasMGC ) {
+        M_list <- split( dataSet$M, chrID )
+        GC_list <- split( dataSet$GC, chrID )
+        infoNames <- c( infoNames, 'map','GC' )
+    }
+
+    # binsize calculation
+
+    coord <- dataSet$coord
+    if ( is.na(binsize) ) {
+        binsize <- min(abs( diff(coord) ))
+    }
+
+    # process peaks for each chromosome
+
+    chrList <- sort(unique(chrID))
+    final_peakset <- c()
+
+    bd_bin_list <- split( bd_bin, chrID )
+    betapH_list <- split( betapH, chrID )
+    coord_list <- split( coord, chrID )
+    Y_list <- split( Y, chrID )
+
+    for ( chr in seq_along(chrList) ) {
+        # initial peaks (bin-level) for given chromosome
+
+        bd_ID <- which( bd_bin_list[[ chrList[chr] ]] == 1 )
+        if ( length(bd_ID) == 0 ) {
+            # there is no peak in this chromosome
+            next
+        }
+
+        # extract data for given chromosome
+
+        betapH_chr <- betapH_list[[ chrList[chr] ]]
+        coord_chr <- coord_list[[ chrList[chr] ]]
+        Y_chr <- Y_list[[ chrList[chr] ]]
+        if ( hasInput ) {
+            X_chr <- X_list[[ chrList[chr] ]]
+        }
+        if ( hasMGC ) {
+            M_chr <- M_list[[ chrList[chr] ]]
+            GC_chr <- GC_list[[ chrList[chr] ]]
+        }
+
+        # generate initial peak list from the peak bins
+
+        peak_range <- IRanges( start=coord_chr[ bd_ID ],
+            end=coord_chr[ bd_ID ] + binsize - 1 )
+        peak_reduced <- reduce(peak_range)
+        peak_start <- start(peak_reduced)
+        peak_stop <- end(peak_reduced)
+
+        # merge close peaks if distance<=maxgap
+        # (extend each peak by maxgap, merge, then remove the extension)
+
+        coord_org <- IRanges( start=peak_start, end=peak_stop+maxgap )
+        coord_merged <- reduce(coord_org)
+        peak_start <- start(coord_merged)
+        peak_stop <- end(coord_merged) - maxgap
+
+        # filter peaks smaller than minsize
+
+        peaksize <- peak_stop - peak_start + 1
+        filterID <- which( peaksize <= minsize )
+        if ( length(filterID) > 0 ) {
+            peak_start <- peak_start[ -filterID ]
+            peak_stop <- peak_stop[ -filterID ]
+        }
+        if ( length(peak_start) == 0 ) {
+            # every candidate peak in this chromosome was filtered out;
+            # without this check peak_info_list[[1]] below fails
+            next
+        }
+
+        # order by coordinates; the ordering is computed once so that
+        # start and stop are permuted identically
+
+        peakOrder <- order(peak_start)
+        peak_start <- peak_start[ peakOrder ]
+        peak_stop <- peak_stop[ peakOrder ]
+        peaksize <- peak_stop - peak_start + 1
+
+        # calculate additional info: match bins to the peak containing them
+
+        peak_range <- IRanges( start=peak_start, end=peak_stop )
+        info_range <- IRanges( start=coord_chr, end=coord_chr )
+        mm <- as.matrix( findOverlaps( peak_range, info_range ) )
+        matchlist <- split( mm[,2], mm[,1] )
+
+        peak_info_list <- lapply( matchlist,
+            function(x) {
+                betapH_i <- betapH_chr[x]
+                info <- c( mean(betapH_i), min(betapH_i),
+                    mean(Y_chr[x]), max(Y_chr[x]) )
+                if ( hasInput ) {
+                    aveInputCount <- mean(X_chr[x])
+                    aveInputCountScaled <- aveInputCount * nRatio
+                    aveLog2Ratio <- mean( log2( (Y_chr[x]+1) / (X_chr[x]*nRatio+1) ) )
+                    info <- c( info, aveInputCount, aveInputCountScaled, aveLog2Ratio )
+                }
+                if ( hasMGC ) {
+                    info <- c( info, mean(M_chr[x]), mean(GC_chr[x]) )
+                }
+                return( info )
+            }
+        )
+
+        peak_info <- matrix( NA, length(peak_start), length(infoNames) )
+        for ( pii in seq_along(peak_start) ) {
+            peak_info[ pii, ] <- peak_info_list[[pii]]
+        }
+
+        # combine all
+
+        final_peakset_chr <- data.frame( rep(chrList[chr],length(peak_start)),
+            peak_start, peak_stop, peaksize, peak_info,
+            stringsAsFactors=FALSE )
+        colnames(final_peakset_chr) <-
+            c( 'chrID','peakStart','peakStop','peakSize', infoNames )
+
+        final_peakset <- rbind( final_peakset, final_peakset_chr )
+    }
+
+    bdBin <- data.frame( chrID, bd_bin, stringsAsFactors=FALSE )
+    colnames(bdBin) <- c("chrID","peak")
+
+    return( list( peakSet=final_peakset, bdBin=bdBin, empFDR=empFDR ) )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_plotGOF.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,99 @@
+
+#########################################################
+# Plot GOF
+#########################################################
+
+# Draw a goodness-of-fit (GOF) plot for a fitted MOSAiCS model.
+#
+# The observed tag-count frequencies are drawn on log10 axes together with
+# frequencies of counts simulated from the fitted parameters:
+#   - 'Sim:N'       : bins drawn as background (Z==0) in the two-signal simulation
+#   - 'Sim:N+S1'    : background + one-signal-component simulation
+#   - 'Sim:N+S1+S2' : background + two-signal-component simulation
+#
+# Arguments:
+#   mosaicsEst  fitted-model object; the slots analysisType, a, pi0, muEst,
+#               b, c, p1, b1, c1, b2 and c2 are read.
+#   tagCount    ChIP tag counts (tabulated to obtain the observed curve).
+#   input       control tag counts; only used when analysisType is "TS" or "IO".
+#   k           constant added to every simulated signal bin.
+#
+# Called for its side effect (the plot).
+
+.plotGOF <- function( mosaicsEst, tagCount, input=0, k=3 )
+{
+    # The simulations below use fixed seeds so that the curves are reproducible.
+    # Save the caller's RNG state and put it back on exit, so that plotting
+    # does not silently reset the random number stream of the session.
+
+    if ( exists( ".Random.seed", envir=globalenv(), inherits=FALSE ) ) {
+        oldSeed <- get( ".Random.seed", envir=globalenv(), inherits=FALSE )
+        on.exit( assign( ".Random.seed", oldSeed, envir=globalenv() ), add=TRUE )
+    } else {
+        on.exit( {
+            if ( exists( ".Random.seed", envir=globalenv(), inherits=FALSE ) ) {
+                rm( list=".Random.seed", envir=globalenv() )
+            }
+        }, add=TRUE )
+    }
+
+    analysisType <- mosaicsEst@analysisType
+    hasControl <- analysisType %in% c( "TS", "IO" )
+
+    # empirical frequencies of ChIP (and control) counts
+
+    YFreq <- table(tagCount)
+    YVal <- as.numeric(names(YFreq))
+
+    if ( hasControl ) {
+        XFreq <- table( input )
+        XVal <- as.numeric(names(XFreq))
+    }
+
+    N <- sum(YFreq)
+
+    # parameter estimates
+
+    a <- mosaicsEst@a
+    pi0 <- mosaicsEst@pi0
+    muEst <- mosaicsEst@muEst
+    bEst <- a / muEst
+
+    b <- mosaicsEst@b
+    c <- mosaicsEst@c
+
+    p1 <- mosaicsEst@p1
+    b1 <- mosaicsEst@b1
+    c1 <- mosaicsEst@c1
+    b2 <- mosaicsEst@b2
+    c2 <- mosaicsEst@c2
+
+    # simulate one sample signal
+    # (the seed is reset before the component labels and again before the counts)
+
+    probZ_1S <- c( pi0, (1-pi0) )
+    set.seed(12345)
+    Zsim_1S <- sample( c(0,1), size=N, prob=probZ_1S, replace=TRUE )
+    set.seed(12345)
+    Ysim_1S <- rnbinom( N, a, bEst/(bEst+1) )
+    isS_1S <- Zsim_1S==1
+    Ysim_1S[ isS_1S ] <- Ysim_1S[ isS_1S ] + rnbinom( sum(isS_1S), b, c/(c+1) ) + k
+
+    YsimFreq_1S <- table(Ysim_1S)
+    YsimVal_1S <- as.numeric(names(YsimFreq_1S))
+
+    # simulate two sample signal
+
+    probZ_2S <- c( pi0, (1-pi0)*p1, (1-pi0)*(1-p1) )
+    set.seed(12345)
+    Zsim_2S <- sample( c(0,1,2), size=N, prob=probZ_2S, replace=TRUE )
+    set.seed(12345)
+    Ysim_2S <- rnbinom( N, a, bEst/(bEst+1) )
+    isS1_2S <- Zsim_2S==1
+    isS2_2S <- Zsim_2S==2
+    Ysim_2S[ isS1_2S ] <- Ysim_2S[ isS1_2S ] + rnbinom( sum(isS1_2S), b1, c1/(c1+1) ) + k
+    Ysim_2S[ isS2_2S ] <- Ysim_2S[ isS2_2S ] + rnbinom( sum(isS2_2S), b2, c2/(c2+1) ) + k
+
+    # background-only curve: bins labelled Z==0 in the two-signal simulation
+
+    YsimZ0Freq <- table( Ysim_2S[ Zsim_2S==0 ] )
+    YsimZ0Val <- as.numeric(names(YsimZ0Freq))
+
+    YsimFreq_2S <- table(Ysim_2S)
+    YsimVal_2S <- as.numeric(names(YsimFreq_2S))
+
+    # draw GOF plot
+
+    plot( log10(YVal+1),log10(YFreq), type='l', ylab='Frequency', xlab='Tag count', axes=FALSE )
+    if ( hasControl ) {
+        points( log10(XVal+1),log10(XFreq), type='l', col='darkgray' )
+    }
+    points( log10(YsimZ0Val+1), log10(YsimZ0Freq), type='l', col='green' )
+    points( log10(YsimVal_1S+1), log10(YsimFreq_1S), type='l', col='red' )
+    points( log10(YsimVal_2S+1), log10(YsimFreq_2S), type='l', col='blue' )
+
+    if ( hasControl ) {
+        legend( 1, log10(max(YFreq)+1),
+            c('Actual data (ChIP)','Actual data (Control)','Sim:N','Sim:N+S1','Sim:N+S1+S2'),
+            col=c('black','darkgray','green','red','blue'),lty=c(1,1,1,1,1),bty='n')
+    } else {
+        legend( 1, log10(max(YFreq)+1),
+            c('Actual data','Sim:N','Sim:N+S1','Sim:N+S1+S2'),
+            col=c('black','green','red','blue'),lty=c(1,1,1,1),bty='n')
+    }
+
+    # vertical guides at tag counts 0, 1 and 2 (x axis is log10(count+1))
+
+    abline(v=log10(0+1),lty=2,col='gray')
+    abline(v=log10(1+1),lty=2,col='gray')
+    abline(v=log10(2+1),lty=2,col='gray')
+    axis(1,0:6,10^c(0:6)-1)
+    axis(2,0:10,10^c(0:10))
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_reportSummary.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,120 @@
+
+# Write a plain-text summary of model fitting and peak calling.
+#
+# Arguments:
+#   summaryFile        destination passed to cat( file= ); it is overwritten by
+#                      the first line and appended to afterwards.
+#   resultList         list with components chrID, n_peaks, peak_width and
+#                      opt_sig_model (one entry per reported row).
+#   chipFile, chipFileFormat, controlFile, controlFileFormat
+#                      names / formats of the aligned read files (printed as is).
+#   binfileDir         not used by this function.
+#   peakFile, peakFileFormat
+#                      parallel vectors; one name/format pair is printed per
+#                      element of peakFileFormat.
+#   byChr, FDR, fragLen, binSize, capping, analysisType, d, signalModel,
+#   maxgap, minsize, thres
+#                      settings echoed in the "Parameter settings" section
+#                      (capping is printed only when > 0).
+
+.reportSummary <- function( summaryFile, resultList,
+    chipFile=NULL, chipFileFormat=NULL,
+    controlFile=NULL, controlFileFormat=NULL,
+    binfileDir=NULL,
+    peakFile=NULL, peakFileFormat=NULL,
+    byChr=FALSE, FDR=0.05, fragLen=200, binSize=fragLen, capping=0,
+    analysisType="IO", d=0.25,
+    signalModel="BIC", maxgap=fragLen, minsize=50, thres=10 ) {
+
+    rule <- "------------------------------------------------------------\n"
+
+    # append to the summary file; 'sep' has the same meaning as in cat()
+    out <- function( ..., sep=" " ) {
+        cat( ..., file=summaryFile, sep=sep, append=TRUE )
+    }
+
+    # section title framed by two rules and followed by a blank line
+    section <- function( title ) {
+        out( rule )
+        out( title, "\n", sep="" )
+        out( rule )
+        out( "\n" )
+    }
+
+    # the first write is NOT appended, so an existing file is replaced
+
+    cat( "MOSAiCS: Summary of model fitting and peak calling\n", file=summaryFile )
+    out( "\n" )
+
+    section( "Input/output file settings" )
+
+    out( "Name of aligned read file (ChIP): ", chipFile, "\n", sep="" )
+    out( "Aligned read file format (ChIP):", chipFileFormat, "\n" )
+    out( "\n" )
+
+    out( "Name of aligned read file (control): ", controlFile, "\n", sep="" )
+    out( "Aligned read file format (control):", controlFileFormat, "\n" )
+    out( "\n" )
+
+    # seq_along() so that nothing is printed when no peak file was given
+    # ( 1:length(NULL) would iterate over 1 and 0 )
+
+    for ( ff in seq_along(peakFileFormat) ) {
+        out( "Name of peak result file: ", peakFile[ff], "\n", sep="" )
+        out( "Peak result file format:", peakFileFormat[ff], "\n" )
+        out( "\n" )
+    }
+
+    section( "Parameter settings" )
+
+    if ( byChr ) {
+        out( "Genome-wide or chromosome-wise analysis? Chromosome-wise analysis\n" )
+    } else  {
+        out( "Genome-wide or chromosome-wise analysis? Genome-wide analysis\n" )
+    }
+
+    # human-readable labels; unrecognized codes are reported verbatim
+    # instead of failing half-way through the report
+
+    analysisTypeOut <- switch( as.character(analysisType),
+        OS = "One-sample analysis",
+        TS = "Two-sample analysis (with mappability & GC content)",
+        IO = "Two-sample analysis (Input only)",
+        as.character(analysisType) )
+
+    signalModelOut <- switch( as.character(signalModel),
+        "BIC" = "Automatic signal model selection based on BIC",
+        "1S" = "One-signal-component model",
+        "2S" = "Two-signal-component model",
+        as.character(signalModel) )
+
+    out( "False discovery rate (FDR):", FDR, "\n" )
+    out( "Fragment length:", fragLen, "\n" )
+    out( "Bin size:", binSize, "\n" )
+    if ( capping > 0 ) {
+        out( "Maximum number of reads allowed in each nucleotide:", capping, "\n" )
+    }
+    out( "Analysis type:", analysisTypeOut, "\n" )
+    out( "d:", d, "\n" )
+    out( "Signal model:", signalModelOut, "\n" )
+    out( "maxgap:", maxgap, "\n" )
+    out( "minsize:", minsize, "\n" )
+    out( "thres:", thres, "\n" )
+
+    out( "\n" )
+    section( "Peak calling summary" )
+
+    outFormat <- data.frame(
+        resultList$chrID, resultList$n_peaks,
+        resultList$peak_width, resultList$opt_sig_model,
+        stringsAsFactors=FALSE )
+    colnames(outFormat) <- c( "chrID", "# peaks",
+        "Median peak width", "Optimal/specified signal model" )
+
+    # table header
+
+    out( as.character(colnames(outFormat)), sep="\t" )
+    out( "\n" )
+    out( rep("-----",3), sep="\t" )
+    out( "\t\t\t-----", sep="\t" )
+    out( "\n" )
+
+    # peak list: the 4th column is pushed right by extra tabs to sit under its header
+
+    for ( i in seq_len(nrow(outFormat)) )
+    {
+        rowText <- as.character(outFormat[i,])
+        out( rowText[1:3], sep="\t" )
+        out( "\t\t", rowText[4], sep="\t" )
+        out( "\n" )
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_rlmFit_IO.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,59 @@
+
+# Robust regression step of the input-only (IO) background fit.
+#
+# Strata-level estimates in 'parEst' (a_u, b_u, X_u, n_u) are used to fit
+#   log(mu_u) ~ X_u^d
+# with rlm(), weighted by the strata sizes; the fitted coefficients are then
+# applied to the bin-level control counts X (capped at inputTrunc) to obtain
+# the bin-level background mean.
+#
+# Returns a list with pi0, a, muEst, betaEst, Y_val and Y_freq.
+# 'Y' is accepted but not used here.
+
+.rlmFit_IO <- function( parEst, d=0.25, Y, X, inputTrunc )
+{
+    # rlm() is expected to be available (MASS)
+
+    # strata without an estimate of 'a' cannot be used
+
+    missingA <- is.na( parEst$a_u )
+    if ( all( missingA ) ) {
+        stop( "insufficient # of proper strata! Cannot proceed!" )
+    }
+
+    idNA <- which( missingA )
+    keepValid <- function( v ) {
+        if ( length(idNA) > 0 ) v[-idNA] else v
+    }
+
+    a_u <- keepValid( parEst$a_u )
+    mu_u <- keepValid( parEst$a_u ) / keepValid( parEst$b_u )
+    X_u <- keepValid( parEst$X_u )
+    n_u <- keepValid( parEst$n_u )
+
+    # estimator of a: strata-size weighted average
+    # [Note] No truncation for input only analysis
+
+    a_w_strata <- sum( a_u * n_u ) / sum( n_u )
+
+    # [Note] bin-level X is truncated for input only analysis
+
+    overCap <- which( X > inputTrunc )
+    X[ overCap ] <- inputTrunc
+
+    # sample size weighted rlm fit
+
+    Xtrans <- X_u^d
+    fit_trunc_adj <- rlm( log(mu_u) ~ Xtrans, weights=n_u/sum(n_u) )
+
+    # bin-level estimates from the fitted coefficients
+
+    betaEst <- coef( fit_trunc_adj )
+    muEst <- exp( betaEst[1] + betaEst[2]*(X^d) )
+
+    pi0 <- .getPi0( muEst, a_w_strata, parEst$Y_freq )
+
+    names(betaEst) <- c( "(intercept)", "input" )
+
+    return( list( pi0 = pi0, a = a_w_strata, muEst=muEst, betaEst=betaEst,
+        Y_val=parEst$Y_val, Y_freq=parEst$Y_freq ))
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_rlmFit_OS.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,95 @@
+
+# Robust regression step of the one-sample (OS) background fit.
+#
+# Strata-level estimates in 'parEst' (a_u, b_u, M_u, GC_u, n_u, mean0_u) are
+# used to fit
+#   log(mu_u) ~ log2(M_u+1) + piecewise-linear spline of GC_u
+# with rlm(), weighted by the strata sizes. The coefficients are then applied
+# to the bin-level mappability (M) and GC content (GC) to obtain the bin-level
+# background mean.
+#
+# Arguments:
+#   parEst      list of strata-level estimates (also carries Y_val, Y_freq).
+#   mean_thres  strata with mu_u - mean0 above this value are dropped, but only
+#               when more than half of the bins have a tag count <= 2.
+#   Y, M, GC    bin-level tag count, mappability and GC content.
+#
+# Returns a list with pi0, a, muEst, betaEst, Y_val and Y_freq.
+
+.rlmFit_OS <- function( parEst, mean_thres=0, Y, M, GC )
+{
+    # rlm() and bs() are expected to be available (MASS, splines)
+
+    # exclude strata without an estimate of 'a'
+    # (estimates from MOM are no longer excluded, ver 1.0.2)
+
+    if ( all( is.na(parEst$a_u) ) ) {
+        stop( "insufficient # of proper strata! Cannot proceed!" )
+    }
+
+    dropIdx <- function( v, idx ) {
+        if ( length(idx) > 0 ) v[-idx] else v
+    }
+
+    idNA <- which( is.na(parEst$a_u) )
+    a_u <- dropIdx( parEst$a_u, idNA )
+    mu_u <- dropIdx( parEst$a_u, idNA ) / dropIdx( parEst$b_u, idNA )
+    M_u <- dropIdx( parEst$M_u, idNA )
+    GC_u <- dropIdx( parEst$GC_u, idNA )
+    n_u <- dropIdx( parEst$n_u, idNA )
+    mean0 <- dropIdx( parEst$mean0_u, idNA )
+
+    # the extra filtering below is applied only when low counts (0, 1, 2)
+    # make up more than half of all bins
+
+    Y_freq <- table(Y)
+    lowCountDominant <-
+        sum(Y_freq[ as.numeric(names(Y_freq))<=2 ])/sum(Y_freq) > 0.5
+
+    # exclude mu>Yhat points
+
+    if ( lowCountDominant ) {
+        id_bigmean <- which( ( mu_u - mean0 ) > mean_thres )
+        a_u <- dropIdx( a_u, id_bigmean )
+        mu_u <- dropIdx( mu_u, id_bigmean )
+        M_u <- dropIdx( M_u, id_bigmean )
+        GC_u <- dropIdx( GC_u, id_bigmean )
+        n_u <- dropIdx( n_u, id_bigmean )
+        mean0 <- dropIdx( mean0, id_bigmean )
+    }
+
+    if ( length(a_u) == 0 ) {
+        stop( "insufficient # of proper strata! Cannot proceed!" )
+    }
+
+    # estimator of a: strata-size weighted average. It is always initialized
+    # with the untrimmed value, so that it is defined even when the
+    # 5%/95% trimming below removes nothing.
+
+    a_wq_strata <- sum(n_u*a_u)/sum(n_u)
+
+    if ( lowCountDominant ) {
+        idTrim <- which( a_u>quantile(a_u,probs=0.95) | a_u<quantile(a_u,probs=0.05) )
+
+        # skip trimming if it would remove every stratum (0/0 = NaN otherwise)
+        if ( length(idTrim)>0 && length(idTrim)<length(a_u) ) {
+            a_u_wq <- a_u[-idTrim]
+            n_u_wq <- n_u[-idTrim]
+            a_wq_strata <- sum(n_u_wq*a_u_wq) / sum(n_u_wq)
+        }
+    }
+
+    # sample size weighted rlm fit
+    # (the same knots are used for the strata-level and the bin-level basis)
+
+    GC_knots <- c(quantile(GC_u,probs=0.25),quantile(GC_u,probs=0.75))
+    GC_u_bs <- bs( GC_u, degree=1, knots=GC_knots )
+    GC_bs <- bs( GC, knots=GC_knots, degree=1 )
+
+    fit_trunc_adj <- rlm( log(mu_u) ~ log2(M_u+1) + GC_u_bs, weights=n_u/sum(n_u) )
+
+    # return estimates
+
+    coef_rlm <- coef(fit_trunc_adj)
+    muEst <- exp( coef_rlm[1] + coef_rlm[2]*log2(M+1) +
+        coef_rlm[3]*GC_bs[,1] + coef_rlm[4]*GC_bs[,2] + coef_rlm[5]*GC_bs[,3] )
+
+    pi0 <- .getPi0( muEst, a_wq_strata, parEst$Y_freq )
+
+    betaEst <- coef_rlm
+    names(betaEst) <- c( "(intercept)", "log2(M+1)", "spline(GC)_1", "spline(GC)_2", "spline(GC)_3" )
+
+    return( list( pi0 = pi0, a = a_wq_strata, muEst=muEst, betaEst=betaEst,
+        Y_val=parEst$Y_val, Y_freq=parEst$Y_freq ))
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/base_rlmFit_TS.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,120 @@
+
+# Robust regression step of the two-sample (TS) background fit.
+#
+# Strata-level estimates in 'parEst' are used to fit, with rlm() weighted by
+# the strata sizes,
+#   log(mu_u) ~ X^d (control <= s) + X^d (control > s)
+#               + [ log2(M+1) + piecewise-linear spline of GC ] (control <= s)
+# i.e. mappability and GC content only enter for strata/bins whose control
+# count is at most 's'. The coefficients are then applied to the bin-level
+# M, GC and X to obtain the bin-level background mean.
+#
+# Returns a list with pi0, a, muEst, betaEst, Y_val and Y_freq.
+# 'Y' is accepted but not used here.
+
+.rlmFit_TS  <- function( parEst, mean_thres=1, s=2, d=0.25, Y, M, GC, X )
+{
+    # rlm() and bs() are expected to be available (MASS, splines)
+
+    # small helpers
+
+    dropIdx <- function( v, idx ) {
+        if ( length(idx) > 0 ) v[-idx] else v
+    }
+    smallControl <- function( v ) {
+        # 0/1 indicator of control counts <= s
+        flag <- rep( 0, length(v) )
+        flag[ v <= s ] <- 1
+        flag
+    }
+
+    # exclude strata without an estimate of 'a'
+    # (estimates from MOM are no longer excluded, ver 1.0.2)
+
+    missingA <- is.na( parEst$a_u )
+    if ( all( missingA ) ) {
+        stop( "insufficient # of proper strata! Cannot proceed!" )
+    }
+
+    idNA <- which( missingA )
+    a_u <- dropIdx( parEst$a_u, idNA )
+    mu_u <- dropIdx( parEst$a_u, idNA ) / dropIdx( parEst$b_u, idNA )
+    X_u <- dropIdx( parEst$X_u, idNA )
+    M_u <- dropIdx( parEst$M_u, idNA )
+    GC_u <- dropIdx( parEst$GC_u, idNA )
+    n_u <- dropIdx( parEst$n_u, idNA )
+    mean0 <- dropIdx( parEst$mean0_u, idNA )
+
+    # exclude mu>Yhat points            [Note_1] mean_thres=1
+
+    id_bigmean <- which( ( mu_u - mean0 ) > mean_thres )
+    a_u <- dropIdx( a_u, id_bigmean )
+    mu_u <- dropIdx( mu_u, id_bigmean )
+    X_u <- dropIdx( X_u, id_bigmean )
+    M_u <- dropIdx( M_u, id_bigmean )
+    GC_u <- dropIdx( GC_u, id_bigmean )
+    n_u <- dropIdx( n_u, id_bigmean )
+
+    # estimator of a: strata-size weighted average, computed after dropping
+    # strata outside the 5%-95% quantile range of a_u (if there are any)
+
+    idTrim <- which( a_u>quantile(a_u,probs=0.95) | a_u<quantile(a_u,probs=0.05) )
+    if ( length(idTrim) > 0 ) {
+        a_u_wq <- a_u[-idTrim]
+        n_u_wq <- n_u[-idTrim]
+        a_wq_strata <- sum(n_u_wq*a_u_wq) / sum(n_u_wq)
+    } else {
+        a_wq_strata <- sum(n_u*a_u)/sum(n_u)
+    }
+
+    # strata-level design             [Note_2] different GC_knots
+
+    GC_knots <- c(quantile(GC_u,probs=0.25),quantile(GC_u,probs=0.75))
+    GC_u_bs <- bs( GC_u, degree=1, knots=GC_knots )
+
+    IX <- smallControl( X_u )
+
+    # exception handling: no 0, 1, 2 counts in input
+    if ( all( IX==0 ) ) {
+        stop( paste("insufficient # of bins with counts <=", s, "in control sample. Please try input-only analysis!") )
+    }
+
+    SmallX_u <- (X_u^d) * IX
+    BigX_u <- (X_u^d) * (1-IX)
+    MX_u <- log2(M_u+1) * IX
+    GC1X_u <- GC_u_bs[,1] * IX
+    GC2X_u <- GC_u_bs[,2] * IX
+    GC3X_u <- GC_u_bs[,3] * IX
+
+    # sample size weighted rlm fit
+
+    fit_trunc_adj <- rlm( log(mu_u) ~ SmallX_u + BigX_u + MX_u +
+        GC1X_u + GC2X_u + GC3X_u, weights=n_u/sum(n_u) )
+
+    # bin-level design, built the same way with the same knots
+
+    GC_all_bs <- bs( GC, knots=GC_knots, degree=1 )
+    IX_all <- smallControl( X )
+
+    SmallX_u_all <- (X^d) * IX_all
+    BigX_u_all <- (X^d) * (1-IX_all)
+    MX_u_all <- log2(M+1) * IX_all
+    GC1X_u_all <- GC_all_bs[,1] * IX_all
+    GC2X_u_all <- GC_all_bs[,2] * IX_all
+    GC3X_u_all <- GC_all_bs[,3] * IX_all
+
+    # return estimates
+
+    betaEst <- coef( fit_trunc_adj )
+    muEst <- exp( betaEst[1] + betaEst[2]*SmallX_u_all + betaEst[3]*BigX_u_all +
+        betaEst[4]*MX_u_all +
+        betaEst[5]*GC1X_u_all + betaEst[6]*GC2X_u_all + betaEst[7]*GC3X_u_all )
+
+    pi0 <- .getPi0( muEst, a_wq_strata, parEst$Y_freq )
+
+    names(betaEst) <- c( "(intercept)", "control(small)", "control(large)", "log2(M+1)",
+        "spline(GC)_1", "spline(GC)_2", "spline(GC)_3" )
+
+    return( list( pi0 = pi0, a = a_wq_strata, muEst=muEst, betaEst=betaEst,
+            Y_val = parEst$Y_val, Y_freq = parEst$Y_freq ) )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/constructBins.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,178 @@
+
+# read alignment files and construct bin-level files
+
+# Convert an aligned read file into bin-level file(s) by running the perl
+# preprocessing script embedded in "mosaics/inst/Perl/".
+#
+# Arguments:
+#   infile      name of the aligned read file (exactly one).
+#   fileFormat  "eland_result" or "sam" for PET data; for SET data also
+#               "eland_extended", "eland_export", "bowtie", "bed", "csem".
+#   outfileLoc  directory handed to the script for the bin-level files.
+#   byChr       if TRUE, bin-level files are constructed by chromosome.
+#   useChrfile, chrfile
+#               whether a chromosome info file is provided, and its name.
+#   excludeChr  chromosomes to exclude; ignored when useChrfile is TRUE.
+#   PET         TRUE for paired-end tag data, FALSE for single-end tag data.
+#   fragLen     average fragment length (passed to the script for SET data only).
+#   binSize     bin size.
+#   capping     maximum number of reads allowed in each nucleotide;
+#               values <= 0 are passed to the script as 0.
+#   perl        perl executable to run.
+#
+# Called for its side effects (files written by the script, printed summaries).
+
+constructBins <- function( infile=NULL, fileFormat=NULL, outfileLoc="./",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    PET=FALSE, fragLen=200, binSize=200, capping=0, perl = "perl" )
+{
+    isPET <- ( PET == TRUE )
+
+    # preprocessing perl script embedded in "mosaics/inst/Perl/"
+
+    if ( isPET ) {
+        script <- "process_readfiles_PET.pl"
+        allFormat <- c( "eland_result", "sam" )
+        allFormatName <- c( "Eland result", "SAM" )
+    } else {
+        script <- "process_readfiles_SET.pl"
+        allFormat <- c( "eland_result", "eland_extended", "eland_export",
+            "bowtie", "sam", "bed", "csem" )
+        allFormatName <- c( "Eland result", "Eland extended", "Eland export",
+            "Bowtie default", "SAM", "BED", "CSEM" )
+    }
+
+    # Check whether perl exists: cannot proceed if "perl -v" prints nothing
+
+    res <- system( paste( perl, "-v" ), intern = TRUE, ignore.stderr = TRUE )
+
+    if ( length(res) == 0 ) {
+        stop( "Perl is not found on your system! Either check $PATH if installed or please install Perl." )
+    }
+
+    # check whether minimal options are missing (NULL has length 0)
+
+    if ( length(infile) != 1 ) {
+        stop( "Please specify the name of the aligned read file!" )
+    }
+
+    if ( length(fileFormat) != 1 ) {
+        stop( "Please specify aligned read file format! Read '?constructBins' for supported file formats" )
+    }
+
+    # check file format specification
+
+    if ( !( fileFormat %in% allFormat ) ) {
+        stop( "Unsupported aligned read file format! Read '?constructBins' for supported file formats" )
+    }
+
+    # if useChrfile is TRUE & excludeChr is NOT null, then ignore excludeChr
+
+    if ( useChrfile && !is.null(excludeChr) ) {
+        message( "User set 'useChrfile' as TRUE and also provided 'excludeChr'." )
+        message( "'excludeChr' argument will be ignored." )
+        excludeChr <- NULL
+    }
+
+    # print out processing settings
+
+    fileFormatName <- allFormatName[ match( fileFormat, allFormat ) ]
+    rule <- "------------------------------------------------------------\n"
+
+    cat( rule )
+    cat( "Info: setting summary\n" )
+    cat( rule )
+    cat( "Name of aligned read file:", infile, "\n" )
+    cat( "Aligned read file format:", fileFormatName, "\n" )
+    cat( "Directory of processed bin-level files:", outfileLoc, "\n" )
+    cat( "Construct bin-level files by chromosome?", ifelse(byChr,"Y","N"), "\n" )
+    cat( "Is file for chromosome info provided?", ifelse(useChrfile,"Y","N"), "\n" )
+    if ( useChrfile == TRUE ) {
+        cat( "Name of file for chromosome info: ", chrfile, "\n" )
+    }
+    if ( !is.null(excludeChr) ) {
+        cat( "List of chromosomes to be excluded:", paste(excludeChr,collapse=", "), "\n" )
+    }
+    if ( !isPET ) {
+        cat( "Data type: Single-end tag (SET)\n" )
+        cat( "Average fragment length:", fragLen, "\n" )
+    } else {
+        cat( "Data type: Paired-end tag (PET)\n" )
+    }
+    cat( "Bin size:", binSize, "\n" )
+    if ( capping > 0 ) {
+        cat( "Maximum number of reads allowed in each nucleotide:", capping, "\n" )
+    }
+    cat( rule )
+
+    # get path to the perl code (unified script for all file formats)
+
+    Fn.Path <- system.file( file.path("Perl",script), package="mosaics")
+
+    # process read file to bin-level files using perl codes
+
+    message( "Info: reading the aligned read file and processing it into bin-level files..." )
+
+    if ( capping <= 0 ) {
+        capping <- 0
+    }
+    if ( is.null(excludeChr) ) {
+        excludeChrVec <- ""
+    } else {
+        # each excluded chromosome becomes a separate trailing argument
+        excludeChrVec <- paste( excludeChr, collapse=" " )
+    }
+
+    # Argument order expected by the scripts; the PET script takes no
+    # fragment length (NULL is dropped by c()).
+    # NOTE(review): arguments are not shell-quoted, so paths containing
+    # spaces or shell metacharacters will not be passed correctly.
+
+    CMD <- paste( c( perl, Fn.Path, infile, outfileLoc, fileFormat,
+        if ( !isPET ) fragLen,
+        binSize, capping,
+        ifelse(byChr,"Y","N"),
+        ifelse(useChrfile,"Y","N"),
+        ifelse(!is.null(chrfile),chrfile,"-"),
+        excludeChrVec ), collapse=" " )
+
+    res <- system( CMD, intern = TRUE )
+
+    message( "Info: done!" )
+
+    # print out processing results
+
+    # extract only filename from infile
+    infilename <- basename( infile )
+
+    if ( isPET ) {
+        outPrefix <- paste(infilename,"_bin",binSize,sep="")
+    } else {
+        outPrefix <- paste(infilename,"_fragL",fragLen,"_bin",binSize,sep="")
+    }
+
+    if ( byChr ) {
+        outfileName <- list.files( path=outfileLoc,
+            pattern=paste(outPrefix,"_.*.txt",sep="") )
+    } else {
+        outfileName <- paste(outPrefix,".txt",sep="")
+    }
+
+    cat( rule )
+    cat( "Info: processing summary\n" )
+    cat( rule )
+    cat( "Directory of processed bin-level files:", outfileLoc, "\n" )
+    if ( byChr ) {
+        cat( "List of processed bin-level files:\n" )
+        # seq_along(): print nothing (rather than "- NA") if no file was found
+        for ( i in seq_along(outfileName) ) {
+            cat( "- ",outfileName[i],"\n", sep="" )
+        }
+    } else {
+        cat( "Processed bin-level file: ",outfileName,"\n", sep="" )
+    }
+    cat( rule )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/generateWig.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,181 @@
+
+# read alignment files and construct wig files
+
+# Convert an aligned read file into wig file(s) by running the perl script
+# embedded in "mosaics/inst/Perl/".
+#
+# Arguments:
+#   infile      name of the aligned read file (exactly one).
+#   fileFormat  "eland_result" or "sam" for PET data; for SET data also
+#               "eland_extended", "eland_export", "bowtie", "bed", "csem".
+#   outfileLoc  directory handed to the script for the wig files.
+#   byChr       if TRUE, wig files are constructed by chromosome.
+#   useChrfile, chrfile
+#               whether a chromosome info file is provided, and its name.
+#   excludeChr  chromosomes to exclude; ignored when useChrfile is TRUE.
+#   PET         TRUE for paired-end tag data, FALSE for single-end tag data.
+#   fragLen     average fragment length (passed to the script for SET data only).
+#   span        span of the wig files.
+#   capping     maximum number of reads allowed in each nucleotide;
+#               values <= 0 are passed to the script as 0.
+#   normConst   normalizing constant.
+#   perl        perl executable to run.
+#
+# Called for its side effects (files written by the script, printed summaries).
+
+generateWig <- function( infile=NULL, fileFormat=NULL, outfileLoc="./",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    PET=FALSE, fragLen=200, span=200, capping=0, normConst=1, perl = "perl" )
+{
+    isPET <- ( PET == TRUE )
+
+    # perl script embedded in "mosaics/inst/Perl/"
+
+    if ( isPET ) {
+        script <- "readfile2wig_PET.pl"
+        allFormat <- c( "eland_result", "sam" )
+        allFormatName <- c( "Eland result", "SAM" )
+    } else {
+        script <- "readfile2wig_SET.pl"
+        allFormat <- c( "eland_result", "eland_extended", "eland_export",
+            "bowtie", "sam", "bed", "csem" )
+        allFormatName <- c( "Eland result", "Eland extended", "Eland export",
+            "Bowtie default", "SAM", "BED", "CSEM" )
+    }
+
+    # Check whether perl exists: cannot proceed if "perl -v" prints nothing
+
+    res <- system( paste( perl, "-v" ), intern = TRUE, ignore.stderr = TRUE )
+
+    if ( length(res) == 0 ) {
+        stop( "Perl is not found on your system! Either check $PATH if installed or please install Perl." )
+    }
+
+    # check whether minimal options are missing (NULL has length 0)
+
+    if ( length(infile) != 1 ) {
+        stop( "Please specify the name of the aligned read file!" )
+    }
+
+    if ( length(fileFormat) != 1 ) {
+        stop( "Please specify aligned read file format! Read '?generateWig' for supported file formats" )
+    }
+
+    # check file format specification
+
+    if ( !( fileFormat %in% allFormat ) ) {
+        stop( "Unsupported aligned read file format! Read '?generateWig' for supported file formats" )
+    }
+
+    # if useChrfile is TRUE & excludeChr is NOT null, then ignore excludeChr
+
+    if ( useChrfile && !is.null(excludeChr) ) {
+        message( "User set 'useChrfile' as TRUE and also provided 'excludeChr'." )
+        message( "'excludeChr' argument will be ignored." )
+        excludeChr <- NULL
+    }
+
+    # print out processing settings
+
+    fileFormatName <- allFormatName[ match( fileFormat, allFormat ) ]
+    rule <- "------------------------------------------------------------\n"
+
+    cat( rule )
+    cat( "Info: setting summary\n" )
+    cat( rule )
+    cat( "Name of aligned read file:", infile, "\n" )
+    cat( "Aligned read file format:", fileFormatName, "\n" )
+    cat( "Directory of processed wig files:", outfileLoc, "\n" )
+    cat( "span of the wig files:", span, "\n" )
+    cat( "Normalizing constant:", normConst, "\n" )
+    cat( "Construct wig files by chromosome?", ifelse(byChr,"Y","N"), "\n" )
+    cat( "Is file for chromosome info provided?", ifelse(useChrfile,"Y","N"), "\n" )
+    if ( useChrfile == TRUE ) {
+        cat( "Name of file for chromosome info: ", chrfile, "\n" )
+    }
+    if ( !is.null(excludeChr) ) {
+        cat( "List of chromosomes to be excluded:", paste(excludeChr,collapse=", "), "\n" )
+    }
+    if ( !isPET ) {
+        cat( "Data type: Single-end tag (SET)\n" )
+        cat( "Average fragment length:", fragLen, "\n" )
+    } else {
+        cat( "Data type: Paired-end tag (PET)\n" )
+    }
+    if ( capping > 0 ) {
+        cat( "Maximum number of reads allowed in each nucleotide:", capping, "\n" )
+    }
+    cat( rule )
+
+    # get path to the perl code (unified script for all file formats)
+
+    Fn.Path <- system.file( file.path("Perl",script), package="mosaics")
+
+    # process read file to wig files using perl codes
+
+    message( "Info: reading the aligned read file and processing it into wig files..." )
+
+    if ( capping <= 0 ) {
+        capping <- 0
+    }
+    if ( is.null(excludeChr) ) {
+        excludeChrVec <- ""
+    } else {
+        # each excluded chromosome becomes a separate trailing argument
+        excludeChrVec <- paste( excludeChr, collapse=" " )
+    }
+
+    # Argument order expected by the scripts; the PET script takes no
+    # fragment length (NULL is dropped by c()).
+    # NOTE(review): arguments are not shell-quoted, so paths containing
+    # spaces or shell metacharacters will not be passed correctly.
+
+    CMD <- paste( c( perl, Fn.Path, infile, outfileLoc, fileFormat,
+        span, normConst,
+        if ( !isPET ) fragLen,
+        capping,
+        ifelse(byChr,"Y","N"),
+        ifelse(useChrfile,"Y","N"),
+        ifelse(!is.null(chrfile),chrfile,"-"),
+        excludeChrVec ), collapse=" " )
+
+    res <- system( CMD, intern = TRUE )
+
+    message( "Info: done!" )
+
+    # print out processing results
+
+    # extract only filename from infile
+    infilename <- basename( infile )
+
+    if ( isPET ) {
+        outPrefix <- paste(infilename,"_span",span,sep="")
+    } else {
+        outPrefix <- paste(infilename,"_fragL",fragLen,"_span",span,sep="")
+    }
+
+    if ( byChr ) {
+        outfileName <- list.files( path=outfileLoc,
+            pattern=paste(outPrefix,"_.*.wig",sep="") )
+    } else {
+        outfileName <- paste(outPrefix,".wig",sep="")
+    }
+
+    cat( rule )
+    cat( "Info: processing summary\n" )
+    cat( rule )
+    cat( "Directory of processed wig files:", outfileLoc, "\n" )
+    if ( byChr ) {
+        cat( "List of processed wig files:\n" )
+        # seq_along(): print nothing (rather than "- NA") if no file was found
+        for ( i in seq_along(outfileName) ) {
+            cat( "- ",outfileName[i],"\n", sep="" )
+        }
+    } else {
+        cat( "Processed wig file: ",outfileName,"\n", sep="" )
+    }
+    cat( rule )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/methods_BinData.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,326 @@
+
+# generic methods for "BinData" class
+
+# "show" method for "BinData": prints a genome-wide summary of the bin-level
+# data and, when more than one chromosome is present, one line per chromosome
+# with its number of bins and its total ChIP tag count.
+setMethod(
+    f="show",
+    signature="BinData",
+    definition=function( object ) {
+        rule <- "----------------------------------------\n"
+        chrNames <- sort(unique( chrID(object) ))
+        nChr <- length(chrNames)
+
+        # genome-wide summary
+
+        cat( "Summary: bin-level data (class: BinData)\n" )
+        cat( rule )
+        cat( "- # of chromosomes in the data: ", nChr, "\n", sep="" )
+        cat( "- total effective tag counts: ", sum(tagCount(object)), "\n", sep="" )
+        cat( "  (sum of ChIP tag counts of all bins)\n" )
+
+        # optional slots are reported as present when they are non-empty
+
+        cat( if ( length(object@input)>0 ) {
+            "- control sample is incorporated\n"
+        } else {
+            "- control sample is NOT incorporated\n"
+        } )
+        cat( if ( length(object@mappability)>0 ) {
+            "- mappability score is incorporated\n"
+        } else {
+            "- mappability score is NOT incorporated\n"
+        } )
+        cat( if ( length(object@gcContent)>0 ) {
+            "- GC content score is incorporated\n"
+        } else {
+            "- GC content score is NOT incorporated\n"
+        } )
+
+        switch( object@dataType,
+            unique = cat( "- uni-reads are assumed\n" ),
+            multi = cat( "- both uni-reads and multi-reads are assumed\n" )
+        )
+        cat( rule )
+
+        # chromosome-wise summary (only when there are several chromosomes)
+
+        if ( nChr > 1 ) {
+            binsByChr <- split( coord(object), chrID(object) )
+            tagsByChr <- split( tagCount(object), chrID(object) )
+
+            cat( "chrID:\t# of bins, total effective tag counts\n" )
+            for ( idx in seq_along(chrNames) ) {
+                nBins <- length( binsByChr[[ chrNames[idx] ]] )
+                nTags <- sum( tagsByChr[[ chrNames[idx] ]] )
+                cat( chrNames[idx],":\t",nBins,", ",nTags,"\n", sep="" )
+            }
+            cat( rule )
+
+            # release the per-chromosome copies
+            rm( binsByChr, tagsByChr )
+            gc()
+        }
+    }
+)
+
+# "print" method for "BinData": returns the bin-level data as a data frame.
+# The columns depend on which slots are filled:
+#   control + mappability + GC : chrID, coord, tagCount, mappability, gcContent, input
+#   control only               : chrID, coord, tagCount, input
+#   no control                 : chrID, coord, tagCount, mappability, gcContent
+setMethod(
+    f="print",
+    signature="BinData",
+    definition=function( x ) {
+        hasControl <- length(x@input) > 0
+        hasMandGC <- length(x@mappability) > 0 & length(x@gcContent) > 0
+
+        if ( !hasControl ) {
+            # one-sample analysis
+
+            return( data.frame(
+                chrID=chrID(x), coord=coord(x), tagCount=tagCount(x),
+                mappability=mappability(x), gcContent=gcContent(x) ) )
+        }
+
+        if ( hasMandGC ) {
+            # two-sample analysis (with M & GC)
+
+            return( data.frame(
+                chrID=chrID(x), coord=coord(x), tagCount=tagCount(x),
+                mappability=mappability(x), gcContent=gcContent(x),
+                input=input(x) ) )
+        }
+
+        # two-sample analysis (Input only)
+
+        return( data.frame(
+            chrID=chrID(x), coord=coord(x), tagCount=tagCount(x),
+            input=input(x) ) )
+    }
+)
+
+# "plot" method for "BinData" (called without 'y'). Draws exploratory plots of
+# the bin-level data, selected by 'plotType':
+#   NULL        frequency curves of ChIP (and, if present, control) tag counts
+#               on log10 axes
+#   "M"         mean ChIP tag count vs. mappability score
+#   "GC"        mean ChIP tag count vs. GC content score
+#   "input"     mean ChIP tag count vs. control tag count
+#   "M|input"   as "M", in panels conditional on control tag count
+#   "GC|input"  as "GC", in panels conditional on control tag count
+# 'inputGrid' holds the control tag count break points that define the panels
+# of the two conditional plots; panel i covers (inputGrid[i], inputGrid[i+1]].
+# Stops when the data required for the requested plot are not in 'x'; warns
+# (and draws nothing) when 'plotType' is not one of the values above.
+# '...' is accepted for compatibility with the generic and is not used.
+setMethod(
+    f="plot",
+    signature=c("BinData","missing"),
+    definition=function( x, y, plotType=NULL, inputGrid=c(-1:10,15,20,30,50,100), ... ) {
+        # helper: mean ChIP tag count against a covariate, with vertical
+        # segments reaching 1.96 standard errors above and below each mean
+        plotMeanCI <- function( stat, ... ) {
+            halfWidth <- 1.96*sqrt(stat$varYall/stat$nitem)
+            plot( stat$uS, stat$meanYall, ylab='Mean ChIP tag count', ... )
+            segments( stat$uS, stat$meanYall, stat$uS, stat$meanYall+halfWidth )
+            segments( stat$uS, stat$meanYall, stat$uS, stat$meanYall-halfWidth )
+        }
+
+        # helper: mean ChIP tag count against 'score', one panel per interval
+        # of control tag count defined by consecutive values of 'inputGrid'
+        # (shingle/xyplot are presumably the lattice functions -- the original
+        # code carried a commented-out library(lattice))
+        plotGivenInput <- function( score, xlab, main ) {
+            S_all <- X_all <- Y_all <- input_mat <- c()
+
+            # seq_len() keeps the loop empty when 'inputGrid' has fewer than two values
+            for( i in seq_len( max( length(inputGrid)-1, 0 ) ) ) {
+                sub_id <- which( input(x)>inputGrid[i] & input(x)<=inputGrid[(i+1)] )
+                if ( length(sub_id) > 0 ) {
+                    stat <- .computeStat( Y=tagCount(x)[sub_id], S=score[sub_id] )
+                    meanYall <- stat$meanYall
+
+                    # means are scaled by the panel maximum; the divisor stays 1
+                    # when there is no mean or the maximum is zero
+                    MaxY <- 1
+                    if( length(meanYall) > 0 ) {
+                        MaxY <- max(meanYall)
+                    }
+                    if( MaxY==0 ) {
+                        MaxY <- 1
+                    }
+
+                    input_mat <- rbind( input_mat, inputGrid[i:(i+1)] )
+                    S_all <- c( S_all, stat$uS )
+                    Y_all <- c( Y_all, meanYall/MaxY )
+                    X_all <- c( X_all, rep(mean(inputGrid[i:(i+1)]),length(stat$uS)) )
+                }
+            }
+
+            # without any populated interval there is nothing to condition on
+            if ( is.null(input_mat) ) {
+                stop( "no bin has a control tag count within the intervals of 'inputGrid'!" )
+            }
+
+            Input <- shingle( X_all, intervals=input_mat )
+            print( xyplot( Y_all ~ S_all | Input,
+                xlab=xlab, ylab='Mean ChIP tag count', main=main, cex=0.5 ) )
+        }
+
+        if ( is.null(plotType) ) {
+            # if "plotType" is missing, just show histogram of tag counts (& input, if possible)
+
+            YFreq <- table( tagCount(x) )
+            YVal <- as.numeric( names(YFreq) )
+
+            if ( length(x@input)>0 ) {
+                XFreq <- table( input(x) )
+                XVal <- as.numeric( names(XFreq) )
+            }
+
+            plot( log10(YVal+1), log10(YFreq), type='l', axes=FALSE,
+                ylab='Frequency', xlab='Tag count', main='Histogram of tag count' )
+
+            if ( length(x@input)>0 ) {
+                points( log10(XVal+1), log10(XFreq), type='l', col='darkgray' )
+            }
+
+            # axes are drawn on the log10 scale but labelled with raw values
+            axis(1,0:6,10^c(0:6)-1)
+            axis(2,0:10,10^c(0:10))
+
+            if ( length(x@input)>0 ) {
+                legend( 1, log10(max(YFreq)+1), c('ChIP','control'),
+                    col=c('black','darkgray'),lty=c(1,1),bty='n')
+            } else {
+                legend( 1, log10(max(YFreq)+1), c('ChIP'),
+                    col=c('black'),lty=1,bty='n')
+            }
+        } else if ( plotType=="M" ) {
+            # plot mean tag count vs. mappability, with 95% CI
+
+            if ( length(x@mappability)==0 ) {
+                stop( "bin-level data does not include mappability score information!" )
+            }
+            statM <- .computeStat( Y=tagCount(x), S=mappability(x) )
+            plotMeanCI( statM,
+                xlab='Mappability score',
+                main='Mappability score vs. Mean ChIP tag count',
+                ylim=quantile( statM$meanYall, prob=c(0.05,0.95) ) )
+        } else if ( plotType=="GC" ) {
+            # plot mean tag count vs. GC content, with 95% CI
+
+            if ( length(x@gcContent)==0 ) {
+                stop( "bin-level data does not include GC content score information!" )
+            }
+            statGC <- .computeStat( Y=tagCount(x), S=gcContent(x) )
+            plotMeanCI( statGC,
+                xlab='GC content score',
+                main='GC content score vs. Mean ChIP tag count',
+                ylim=quantile( statGC$meanYall, prob=c(0.05,0.95) ) )
+        } else if ( plotType=="input" ) {
+            # plot mean tag count vs. input mean tag count, with 95% CI
+
+            if ( length(x@input)==0 ) {
+                stop( "bin-level data does not include control tag count information!" )
+            }
+            statInput <- .computeStat( Y=tagCount(x), S=input(x) )
+            plotMeanCI( statInput,
+                xlab='Control tag count',
+                main='Control tag count vs. Mean ChIP tag count',
+                xlim=c(0,quantile(input(x),0.9999)) )
+        } else if ( plotType=="M|input" ) {
+            # plot mean tag count vs. mappability, conditional on input
+
+            if ( length(x@input)==0 ) {
+                stop( "bin-level data does not include control tag count information!" )
+            }
+            # the conditional plot needs the mappability score as well
+            if ( length(x@mappability)==0 ) {
+                stop( "bin-level data does not include mappability score information!" )
+            }
+            plotGivenInput( mappability(x),
+                xlab='Mappability score',
+                main='Mappability score vs. Mean ChIP tag count,\nconditional on Control tag count' )
+        } else if ( plotType=="GC|input" ) {
+            # plot mean tag count vs. GC content, conditional on input
+
+            if ( length(x@input)==0 ) {
+                stop( "bin-level data does not include control tag count information!" )
+            }
+            # the conditional plot needs the GC content score as well
+            if ( length(x@gcContent)==0 ) {
+                stop( "bin-level data does not include GC content score information!" )
+            }
+            plotGivenInput( gcContent(x),
+                xlab='GC content score',
+                main='GC content score vs. Mean ChIP tag count,\nconditional on Control tag count' )
+        } else {
+            # previously an unknown 'plotType' silently produced no plot
+            warning( "'plotType' should be one of 'M', 'GC', 'input', 'M|input', or 'GC|input'. Nothing plotted." )
+        }
+    }
+)
+
+# Accessor: returns the 'chrID' slot of a "BinData" object.
+setMethod(
+    f="chrID",
+    signature="BinData",
+    definition=function( object ) object@chrID
+)
+
+# Accessor: returns the 'coord' slot of a "BinData" object.
+setMethod(
+    f="coord",
+    signature="BinData",
+    definition=function( object ) object@coord
+)
+
+# Accessor: returns the 'tagCount' slot of a "BinData" object.
+setMethod(
+    f="tagCount",
+    signature="BinData",
+    definition=function( object ) object@tagCount
+)
+
+# Accessor: returns the 'mappability' slot of a "BinData" object.
+# When the slot is empty, issues a warning and returns NULL.
+setMethod(
+    f="mappability",
+    signature="BinData",
+    definition=function( object ) {
+        if ( length(object@mappability)==0 ) {
+            warning( "no mappability score information provided!" )
+            return(NULL)
+        }
+        object@mappability
+    }
+)
+
+# Accessor: returns the 'gcContent' slot of a "BinData" object.
+# When the slot is empty, issues a warning and returns NULL.
+setMethod(
+    f="gcContent",
+    signature="BinData",
+    definition=function( object ) {
+        if ( length(object@gcContent)==0 ) {
+            warning( "no GC content score information provided!" )
+            return(NULL)
+        }
+        object@gcContent
+    }
+)
+
+# Accessor: returns the 'input' (control sample) slot of a "BinData" object.
+# When the slot is empty, issues a warning and returns NULL.
+setMethod(
+    f="input",
+    signature="BinData",
+    definition=function( object ) {
+        if ( length(object@input)==0 ) {
+            warning( "no control sample information provided!" )
+            return(NULL)
+        }
+        object@input
+    }
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/methods_MosaicsFit.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,93 @@
+
+# generic methods for "MosaicsFit" class
+
+# "show" method for "MosaicsFit": prints the analysis type, the tuning
+# parameters stored for that analysis type, and the BIC values of the
+# one- and two-signal-component models.
+setMethod(
+    f="show",
+    signature="MosaicsFit",
+    definition=function( object ) {
+        param <- object@mosaicsParam
+        bicOne <- object@bic1S
+        bicTwo <- object@bic2S
+        fitType <- object@mosaicsEst@analysisType
+        rule <- "--------------------------------------------------\n"
+
+        # the two BIC lines are the same for every analysis type
+        showBIC <- function() {
+            cat( "BIC of one-signal-component model = ", bicOne, "\n", sep="" )
+            cat( "BIC of two-signal-component model = ", bicTwo, "\n", sep="" )
+        }
+
+        cat( "Summary: MOSAiCS model fitting (class: MosaicsFit)\n" )
+        cat( rule )
+        switch( fitType,
+            OS = {
+                cat( "analysis type: one-sample analysis\n" )
+                cat( "parameters used: k = ", param@k,
+                    ", meanThres = ", param@meanThres, "\n", sep="" )
+                showBIC()
+            },
+            TS = {
+                cat( "analysis type: two-sample analysis (with mappability & GC content)\n" )
+                cat( "parameters used: k = ", param@k,
+                    ", meanThres = ", param@meanThres,
+                    ", s = ", param@s, ", d = ", param@d, "\n", sep="" )
+                showBIC()
+            },
+            IO = {
+                cat( "analysis type: two-sample analysis (Input only)\n" )
+                cat( "parameters used: k = ", param@k, ", d = ", param@d, "\n", sep="" )
+                showBIC()
+            }
+        )
+
+        cat( rule )
+    }
+)
+
+# "print" method for "MosaicsFit": not implemented; only issues a warning.
+setMethod(
+    f="print",
+    signature="MosaicsFit",
+    definition=function( x ) {
+        notSupported <- "'print' method for 'MosaicsFit' class is not supported yet."
+        warning( notSupported )
+    }
+)
+
+# "plot" method for "MosaicsFit": hands the fitted estimates and the tag counts
+# to .plotGOF(); the control counts are passed as well unless the fit is a
+# one-sample ("OS") analysis.
+setMethod(
+    f="plot",
+    signature="MosaicsFit",
+    definition=function( x ) {
+        est <- x@mosaicsEst
+
+        if ( est@analysisType!="OS" ) {
+            .plotGOF( mosaicsEst=est, tagCount=x@tagCount, input=x@input, k=3 )
+        } else {
+            .plotGOF( mosaicsEst=est, tagCount=x@tagCount, k=3 )
+        }
+    }
+)
+
+# "estimates" method for "MosaicsFit": returns a named list holding the slots
+# pi0, a, betaEst, b, c, p1, b1, c1, b2 and c2 of the 'mosaicsEst' slot.
+setMethod(
+    f="estimates",
+    signature="MosaicsFit",
+    definition=function( object ) {
+        est <- object@mosaicsEst
+
+        # slots to report, in the order they appear in the returned list
+        reported <- c( "pi0", "a", "betaEst", "b", "c", "p1", "b1", "c1", "b2", "c2" )
+
+        out <- lapply( reported, function( slotName ) slot( est, slotName ) )
+        names(out) <- reported
+        out
+    }
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/methods_MosaicsPeak.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,149 @@
+
+# generic methods for "MosaicsPeak" class
+
+# "show" method for "MosaicsPeak": prints the model used for peak calling, the
+# peak calling settings and, when peaks exist, the number of peaks, the median
+# peak width and the empirical FDR (rounded to four decimals).
+setMethod(
+    f="show",
+    signature="MosaicsPeak",
+    definition=function( object ) {
+
+        setting <- object@peakParam
+
+        fitType <- setting@analysisType
+        sigModel <- setting@signalModel
+        fdrLevel <- setting@FDR
+        gapMax <- setting@maxgap
+        sizeMin <- setting@minsize
+        countThres <- setting@thres
+
+        rule <- "--------------------------------------------------\n"
+
+        cat( "Summary: MOSAiCS peak calling (class: MosaicsPeak)\n" )
+        cat( rule )
+        cat( "final model: " )
+
+        # an unmatched value yields NULL, for which cat() prints nothing
+        cat( switch( fitType,
+            "OS" = "one-sample analysis ",
+            "TS" = "two-sample analysis (with M & GC) ",
+            "IO" = "two-sample analysis (input only) "
+        ) )
+        cat( switch( sigModel,
+            "1S" = "with one signal component\n",
+            "2S" = "with two signal components\n"
+        ) )
+
+        cat( "setting: FDR = ", fdrLevel, ", maxgap = ", gapMax,
+            ", minsize = ", sizeMin, ", thres = ", countThres, "\n", sep="" )
+
+        peaks <- object@peakList
+        if ( length( peaks$peakStart ) > 0 ) {
+            cat( "# of peaks = ", nrow(peaks), "\n", sep="" )
+            cat( "median peak width = ", median(peaks$peakSize), "\n", sep="" )
+            cat( "empirical FDR = ", round(empFDR(object)*10000)/10000, "\n", sep="" )
+        } else {
+            # exception handling (no peak case)
+
+            cat( "(no peak detected)\n" )
+        }
+        cat( rule )
+    }
+)
+
+# "print" method for "MosaicsPeak": returns the peak list when it has at least
+# one row; otherwise only emits an informational message.
+setMethod(
+    f="print",
+    signature="MosaicsPeak",
+    definition=function( x ) {
+        peaks <- x@peakList
+
+        if ( !( nrow(peaks) > 0 ) ) {
+            # exception handling (no peak case)
+
+            message( "Info: no peak detected. Nothing can be returned." )
+        } else {
+            return(peaks)
+        }
+    }
+)
+
+# "export" method for "MosaicsPeak": writes the peak list to a file.
+#   type      one of "txt", "gff", "bed". The default NA, and any value that is
+#             not a single one of these strings, falls back to "txt" with an
+#             informational message.
+#   filename  output file name. When not given (NA or NULL), "peakList.<type>"
+#             is used.
+# The file is written by .exportTXT(), .exportGFF() or .exportBED(); nothing is
+# written when the peak list has no rows.
+setMethod(
+    f="export",
+    signature="MosaicsPeak",
+    definition=function( object, type=NA, filename=NA ) {
+        allType <- c("txt","gff","bed")
+
+        # error treatment: 'type' missing or invalid
+        # ('type' must be a single string, because switch() below needs a
+        # length-one value; NULL and longer vectors are treated as incorrect)
+
+        if ( length(type) == 1 && is.na(type) )
+        {
+            message( "Info: 'type' is not specified by the user." )
+            message( "Info: 'type' is specified as 'txt' instead." )
+            type <- "txt"
+        } else if ( length(type) != 1 || !( type %in% allType ) )
+        {
+            message( "Info: 'type' incorrect." )
+            message( "Info: 'type' is specified as 'txt' instead." )
+            type <- "txt"
+        }
+
+        # error treatment: 'filename' not specified
+
+        if ( is.null(filename) || ( length(filename) == 1 && is.na(filename) ) )
+        {
+            filename <- paste("peakList.",type,sep="")
+            message( "Info: 'filename' is not specified by the user." )
+            # report the name that is really used (it includes the extension)
+            message( "Info: 'filename' is specified as '", filename, "' instead." )
+        }
+
+        # export peak lists
+
+        peakList <- object@peakList
+
+        if ( nrow(peakList) > 0 ) {
+            message( "Info: exporting the peak list..." )
+            switch( type,
+                "gff" = {
+                    .exportGFF( peakList=peakList, filename=filename )
+                },
+                "bed" = {
+                    .exportBED( peakList=peakList, filename=filename )
+                },
+                "txt" = {
+                    .exportTXT( peakList=peakList, filename=filename )
+                }
+            )
+        } else {
+            # exception handling (no peak case)
+
+            message( "Info: no peak identified. Nothing exported." )
+        }
+    }
+)
+
+# Accessor: returns the 'empFDR' slot of a "MosaicsPeak" object.
+setMethod(
+    f="empFDR",
+    signature="MosaicsPeak",
+    definition=function( object ) object@empFDR
+)
+
+# Accessor: returns the 'bdBin' slot of a "MosaicsPeak" object.
+setMethod(
+    f="bdBin",
+    signature="MosaicsPeak",
+    definition=function( object ) object@bdBin
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/mosaicsFit.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,328 @@
+
+# fit MOSAiCS
+
+# "mosaicsFit" method for "BinData": fits the MOSAiCS model to bin-level data.
+#   analysisType  "OS" (one-sample), "TS" (two-sample with mappability & GC
+#                 content), "IO" (two-sample, input only), or "automatic", in
+#                 which case the type is chosen from the slots filled in 'object'
+#   bgEst         "matchLow", "rMOM", or "automatic" (chosen from the share of
+#                 bins with tag count <= 2)
+#   k, meanThres, s, d, truncProb
+#                 tuning parameters handed to the fitting routine of the
+#                 selected analysis type; a missing 'meanThres' defaults to
+#                 0 for "OS" and 1 for "TS"
+#   parallel, nCore
+#                 parallel computing options handed to the fitting routine
+# Returns the object built by .mosaicsFit_OS(), .mosaicsFit_TS() or
+# .mosaicsFit_IO(). Stops when 'analysisType' or 'bgEst' is not recognised or
+# when the data needed for the requested analysis are not available.
+setMethod(
+    f="mosaicsFit",
+    signature="BinData",
+    definition=function( object, analysisType="automatic", bgEst="automatic",
+        k=3, meanThres=NA, s=2, d=0.25, truncProb=0.999,
+        parallel=FALSE, nCore=8 )
+    {
+        # Note: users can tune parameters only regarding MOSAiCS model fitting.
+        # Note: tuning adaptive griding parameters is not supported yet.
+
+        # check options: parallel computing (optional)
+
+        if ( parallel == TRUE ) {
+            message( "Use 'parallel' package for parallel computing." )
+            if ( length(find.package('parallel',quiet=TRUE)) == 0 ) {
+                stop( "Please install 'parallel' package!" )
+            }
+        }
+
+        # check validity of "analysisType"
+        # (an unknown value would otherwise match no branch of the final
+        # switch() and fail later with "object 'fit' not found")
+
+        if ( length(analysisType) != 1 ||
+            !( analysisType %in% c("automatic","OS","TS","IO") ) ) {
+            stop( "Incorrect specification for 'analysisType'! 'analysisType' should be one of 'OS', 'TS', 'IO', or 'automatic'!" )
+        }
+
+        # which pieces of data are available
+
+        hasInput <- length(object@input) > 0
+        hasMGC <- length(object@mappability) > 0 && length(object@gcContent) > 0
+
+        if ( analysisType == "automatic" ) {
+            # if "analysisType" is not specified, "analysisType" is determined by dataset
+
+            if ( !hasInput ) {
+                # we don't have input: we should have both of M and GC
+
+                if ( hasMGC ) {
+                    analysisType <- "OS"
+                } else {
+                    stop( "any of control data, mappability, or GC content does not exist. Cannot proceed!\n" )
+                }
+            } else {
+                # we have input: TS if we have both M & GC; IO otherwise.
+
+                if ( hasMGC ) {
+                    analysisType <- "TS"
+                } else {
+                    analysisType <- "IO"
+                }
+            }
+        } else {
+            # if "analysisType" is specified, check its validity
+
+            # "TS" is handled first, so that the analysis type it falls back to
+            # goes through the "IO" and "OS" checks below
+
+            if ( analysisType=="TS" && !hasInput )
+            {
+                # error treatment: If input data does not exist, TS analysis is not available
+
+                message( "Info: two-sample analysis (with mappability & GC content)." )
+                message( "Info: control data does not exist." )
+                message( "Info: one-sample analysis will be implemented instead." )
+                analysisType <- "OS"
+            } else if ( analysisType=="TS" && !hasMGC )
+            {
+                # error treatment: If M or GC does not exist, TS analysis is not available
+
+                message( "Info: two-sample analysis (with mappability & GC content)." )
+                message( "Info: mappability or GC content data does not exist." )
+                message( "Info: two-sample analysis (Input only) will be implemented instead." )
+                analysisType <- "IO"
+            }
+
+            # error treatment: Input-only analysis is impossible if input data does not exist
+
+            if ( analysisType=="IO" && !hasInput )
+            {
+                message( "Info: two-sample analysis (Input only)." )
+                stop( "control data does not exist. Cannot proceed!\n" )
+            }
+
+            # error treatment: If M or GC does not exist, OS analysis is not available
+
+            if ( analysisType=="OS" && !hasMGC )
+            {
+                message( "Info: one-sample analysis." )
+                stop( "mappability or GC content does not exist. Cannot proceed!\n" )
+            }
+        }
+
+        # check validity of "bgEst"
+
+        if ( bgEst == "automatic" ) {
+            message( "Info: background estimation method is determined based on data." )
+
+            Y_freq <- table( object@tagCount )
+
+            # "matchLow" when more than half of the bins have tag count <= 2
+            if ( sum(Y_freq[ as.numeric(names(Y_freq))<=2 ]) / sum(Y_freq) > 0.5 ) {
+                message( "Info: background estimation based on bins with low tag counts." )
+                bgEst <- "matchLow"
+            } else {
+                message( "Info: background estimation based on robust method of moment" )
+                bgEst <- "rMOM"
+            }
+        } else if ( bgEst == "matchLow" ) {
+            message( "Info: background estimation based on bins with low tag counts." )
+        } else if ( bgEst == "rMOM" ) {
+            message( "Info: background estimation based on robust method of moment." )
+        } else {
+            stop( "Incorrect specification for 'bgEst'! 'bgEst' should be one of 'matchLow', 'rMOM', or 'automatic'!" )
+        }
+
+        # default meanThres for each of "OS" & "TS"
+
+        if ( is.na(meanThres) )
+        {
+            switch( analysisType,
+                OS = {
+                    meanThres <- 0
+                },
+                TS = {
+                    meanThres <- 1
+                },
+                IO = {
+                    meanThres <- NA     # meanThres is not used for analysisType=="IO"
+                }
+            )
+        }
+
+        # MOSAiCS model fit
+
+        switch( analysisType,
+            OS = {
+                # one-sample analysis
+
+                message( "Info: one-sample analysis." )
+                fit <- .mosaicsFit_OS( object, bgEst=bgEst, k=k, meanThres=meanThres,
+                    parallel=parallel, nCore=nCore )
+            },
+            TS = {
+                # two-sample analysis (with M & GC)
+
+                message( "Info: two-sample analysis (with mappability & GC content)." )
+                fit <- .mosaicsFit_TS( object, bgEst=bgEst,
+                    k=k, meanThres=meanThres, s=s, d=d,
+                    parallel=parallel, nCore=nCore )
+            },
+            IO = {
+                # two-sample analysis (Input only)
+
+                message( "Info: two-sample analysis (Input only)." )
+                fit <- .mosaicsFit_IO( object, bgEst=bgEst,
+                    k=k, d=d, truncProb=truncProb,
+                    parallel=parallel, nCore=nCore )
+            }
+        )
+
+        message( "Info: done!" )
+
+        return(fit)
+    }
+)
+
+# MOSAiCS one-sample analysis
+#
+# Fits the background model from tag counts, mappability and GC content, then
+# the one- and two-signal-component models, computes the BIC of both, and
+# returns a "MosaicsFit" object holding the estimates, the parameters used,
+# the bin-level data and the two BIC values.
+
+.mosaicsFit_OS <- function( binData, bgEst, k=3, meanThres=0,
+    parallel=FALSE, nCore=8 )
+{
+    # local aliases for the slots used repeatedly below
+    Y <- binData@tagCount
+    M <- binData@mappability
+    GC <- binData@gcContent
+
+    message( "Info: use adaptive griding." )
+    message( "Info: fitting background model..." )
+    gridFit <- .adapGridMosaicsZ0_OS(
+        Y=Y, M=M, GC=GC,
+        bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50),
+        parallel=parallel, nCore=nCore )
+    bgFit <- .rlmFit_OS( parEst=gridFit, mean_thres=meanThres, Y=Y, M=M, GC=GC )
+    pN <- .calcPN( Y=Y, k=k, a=bgFit$a, mu_est=bgFit$muEst )
+
+    # the grid-level fit is no longer needed
+    rm( gridFit )
+    gc()
+    message( "Info: done!" )
+
+    yBdAll <- .calcYbdAll( bgFit, k=k )
+    message( "Info: fitting one-signal-component model..." )
+    sig1 <- .mosaicsZ1_1S( bgFit, Y=Y, pNfit=pN, Y_bd_all=yBdAll, k=k )
+    message( "Info: fitting two-signal-component model..." )
+    sig2 <- .mosaicsZ1_2S( bgFit, Y=Y, pNfit=pN, Y_bd_all=yBdAll, k=k )
+
+    message( "Info: calculating BIC of fitted models..." )
+    bic1 <- .calcModelBIC( fitZ1=sig1, Y=Y,
+        pNfit=pN, k=k, model="1S", type="BIC", npar=9 )
+    bic2 <- .calcModelBIC( fitZ1=sig2, Y=Y,
+        pNfit=pN, k=k, model="2S", type="BIC", npar=12 )
+
+    est <- new( "MosaicsFitEst",
+        pi0=bgFit$pi0, a=bgFit$a,
+        betaEst=bgFit$betaEst, muEst=bgFit$muEst, pNfit=pN,
+        b=sig1$b, c=sig1$c,
+        p1=sig2$p1, b1=sig2$b1, c1=sig2$c1, b2=sig2$b2, c2=sig2$c2,
+        analysisType="OS" )
+
+    # the intermediate fits are now stored in 'est'
+    rm( bgFit, sig1, sig2 )
+    gc()
+
+    param <- new( "MosaicsFitParam", k=k, meanThres=meanThres )
+
+    new( "MosaicsFit",
+        mosaicsEst=est, mosaicsParam=param,
+        chrID=binData@chrID, coord=binData@coord, tagCount=Y,
+        mappability=M, gcContent=GC,
+        bic1S=bic1, bic2S=bic2 )
+}
+
+# MOSAiCS two-sample analysis (with M & GC)
+#
+# Fits the background model from tag counts, mappability, GC content and
+# control counts, then the one- and two-signal-component models, computes the
+# BIC of both, and returns a "MosaicsFit" object holding the estimates, the
+# parameters used (k, meanThres, s, d), the bin-level data and the BIC values.
+
+.mosaicsFit_TS <- function( binData, bgEst, k=3, meanThres=1, s=2, d=0.25,
+    parallel=FALSE, nCore=8 )
+{
+    message( "Info: use adaptive griding." )
+    message( "Info: fitting background model..." )
+
+    # warning if fewer than half of the bins have control counts <= s
+    # -> input-only analysis is preferred.
+    # (the threshold follows 's', as reported in the message; it was
+    # previously fixed at 2, which is the default value of 's')
+
+    X_freq <- table(binData@input)
+    if( sum(X_freq[ as.numeric(names(X_freq))<=s ])/sum(X_freq) < 0.5 ) {
+        message( paste("Info: insufficient # of bins with counts <=", s, "in control sample.") )
+        message( "Info: Model fit can be unstable. Input-only analysis might be preferred." )
+    }
+
+    fitParam <- .adapGridMosaicsZ0_TS(
+        Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input,
+        bgEst=bgEst, min_n_MGC=50, grids_MGC=c(0.01,0.02,0.04,0.10,0.20,0.50), min_n_X=200,
+        parallel=parallel, nCore=nCore )
+    fitZ0 <- .rlmFit_TS( parEst=fitParam, mean_thres=meanThres, s=s, d=d,
+        Y=binData@tagCount, M=binData@mappability, GC=binData@gcContent, X=binData@input )
+    pNfit <- .calcPN( Y=binData@tagCount, k=k, a=fitZ0$a, mu_est=fitZ0$muEst )
+
+    # the grid-level fit is no longer needed
+    rm( fitParam )
+    gc()
+    message( "Info: done!" )
+
+    Y_bd_all <- .calcYbdAll( fitZ0, k=k )
+    message( "Info: fitting one-signal-component model..." )
+    fitZ1_1S <- .mosaicsZ1_1S( fitZ0, Y=binData@tagCount,
+        pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
+    message( "Info: fitting two-signal-component model..." )
+    fitZ1_2S <- .mosaicsZ1_2S( fitZ0, Y=binData@tagCount,
+        pNfit=pNfit, Y_bd_all=Y_bd_all, k=k )
+
+    message( "Info: calculating BIC of fitted models..." )
+    fitBIC_1S <- .calcModelBIC( fitZ1=fitZ1_1S, Y=binData@tagCount,
+        pNfit=pNfit, k=k, model="1S", type="BIC", npar=11 )
+    fitBIC_2S <- .calcModelBIC( fitZ1=fitZ1_2S, Y=binData@tagCount,
+        pNfit=pNfit, k=k, model="2S", type="BIC", npar=14 )
+
+    mosaicsEst <- new( "MosaicsFitEst",
+        pi0=fitZ0$pi0, a=fitZ0$a,
+        betaEst=fitZ0$betaEst, muEst=fitZ0$muEst, pNfit=pNfit,
+        b=fitZ1_1S$b, c=fitZ1_1S$c,
+        p1=fitZ1_2S$p1, b1=fitZ1_2S$b1, c1=fitZ1_2S$c1, b2=fitZ1_2S$b2, c2=fitZ1_2S$c2,
+        analysisType="TS" )
+
+    # the intermediate fits are now stored in 'mosaicsEst'
+    rm( fitZ0, fitZ1_1S, fitZ1_2S )
+    gc()
+
+    mosaicsParam <- new( "MosaicsFitParam", k=k, meanThres=meanThres, s=s, d=d )
+
+    new( "MosaicsFit",
+        mosaicsEst=mosaicsEst, mosaicsParam=mosaicsParam,
+        chrID=binData@chrID, coord=binData@coord,
+        tagCount=binData@tagCount, input=binData@input,
+        mappability=binData@mappability, gcContent=binData@gcContent,
+        bic1S=fitBIC_1S, bic2S=fitBIC_2S )
+}
+
+# MOSAiCS two-sample analysis (Input only)
+#
+# Fits the background model from tag counts and control counts (using the
+# 'truncProb' quantile of the control counts as 'inputTrunc'), then the one-
+# and two-signal-component models, computes the BIC of both, and returns a
+# "MosaicsFit" object holding the estimates, the parameters used (k, d), the
+# bin-level data and the two BIC values.
+
+.mosaicsFit_IO <- function( binData, bgEst, k=3, d=0.25, truncProb=0.999,
+    parallel=FALSE, nCore=8 )
+{
+    # local aliases for the slots used repeatedly below
+    Y <- binData@tagCount
+    X <- binData@input
+
+    message( "Info: use adaptive griding." )
+    message( "Info: fitting background model..." )
+
+    inputTrunc <- quantile( X, truncProb )
+
+    gridFit <- .adapGridMosaicsZ0_IO( Y=Y, X=X,
+        bgEst=bgEst, inputTrunc=inputTrunc, min_n_X=50,
+        parallel=parallel, nCore=nCore )
+    bgFit <- .rlmFit_IO( parEst=gridFit, d=d, Y=Y, X=X, inputTrunc=inputTrunc )
+    pN <- .calcPN( Y=Y, k=k, a=bgFit$a, mu_est=bgFit$muEst )
+
+    # the grid-level fit is no longer needed
+    rm( gridFit )
+    gc()
+    message( "Info: done!" )
+
+    yBdAll <- .calcYbdAll( bgFit, k=k )
+    message( "Info: fitting one-signal-component model..." )
+    sig1 <- .mosaicsZ1_1S( bgFit, Y=Y, pNfit=pN, Y_bd_all=yBdAll, k=k )
+    message( "Info: fitting two-signal-component model..." )
+    sig2 <- .mosaicsZ1_2S( bgFit, Y=Y, pNfit=pN, Y_bd_all=yBdAll, k=k )
+
+    message( "Info: calculating BIC of fitted models..." )
+    bic1 <- .calcModelBIC( fitZ1=sig1, Y=Y,
+        pNfit=pN, k=k, model="1S", type="BIC", npar=6 )
+    bic2 <- .calcModelBIC( fitZ1=sig2, Y=Y,
+        pNfit=pN, k=k, model="2S", type="BIC", npar=9 )
+
+    est <- new( "MosaicsFitEst",
+        pi0=bgFit$pi0, a=bgFit$a,
+        betaEst=bgFit$betaEst, muEst=bgFit$muEst, pNfit=pN,
+        b=sig1$b, c=sig1$c,
+        p1=sig2$p1, b1=sig2$b1, c1=sig2$c1, b2=sig2$b2, c2=sig2$c2,
+        inputTrunc=inputTrunc, analysisType="IO" )
+
+    # the intermediate fits are now stored in 'est'
+    rm( bgFit, sig1, sig2 )
+    gc()
+
+    param <- new( "MosaicsFitParam", k=k, d=d )
+
+    new( "MosaicsFit",
+        mosaicsEst=est, mosaicsParam=param,
+        chrID=binData@chrID, coord=binData@coord,
+        tagCount=Y, input=X,
+        bic1S=bic1, bic2S=bic2 )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/mosaicsPeak.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,95 @@
+
+# MOSAiCS peak calling
+
+# Peak calling for a fitted MOSAiCS model ("MosaicsFit" object).
+#
+# Arguments:
+#   object       "MosaicsFit" object (fitted estimates plus bin-level data).
+#   signalModel  "1S" (one-signal-component) or "2S" (two-signal-component);
+#                anything else stops with an error.
+#   FDR, binsize, maxgap, minsize, thres
+#                passed through unchanged to .peakCall().
+#
+# Value: a "MosaicsPeak" object. Its peak list is an empty data frame when
+# .peakCall() returns no peak set.
+setMethod(
+    f="mosaicsPeak",
+    signature="MosaicsFit",
+    definition=function( object, signalModel="2S", FDR=0.05,
+        binsize=NA, maxgap=200, minsize=50, thres=10 )
+    {
+        est <- object@mosaicsEst
+        analysisType <- est@analysisType
+
+        # reject anything other than the two supported signal models
+
+        if ( !( signalModel=="1S" || signalModel=="2S" ) )
+        {
+            stop( "Invalid signal model: should be either '1S' or '2S'!" )
+        }
+
+        # marginal densities & posterior probabilities under the chosen model
+
+        pi0 <- est@pi0
+
+        if ( signalModel=="1S" ) {
+            message( "Info: use one-signal-component model." )
+            message( "Info: calculating posterior probabilities..." )
+            fitMD <- .margDist_1S( mosaicsEst=est,
+                tagCount=object@tagCount, pNfit=est@pNfit, k=3 )
+            fitPH <- .getPH_1S( margDensity=fitMD, pi0=pi0 )
+        } else {
+            # validation above guarantees this is the "2S" case
+
+            p1 <- est@p1
+
+            message( "Info: use two-signal-component model." )
+            message( "Info: calculating posterior probabilities..." )
+            fitMD <- .margDist_2S( mosaicsEst=est,
+                tagCount=object@tagCount, pNfit=est@pNfit, k=3 )
+            fitPH <- .getPH_2S( margDensity=fitMD, pi0=pi0, p1=p1 )
+        }
+
+        # the marginal densities are no longer needed; release them early
+
+        rm(fitMD)
+        gc()
+
+        # assemble the bin-level data handed to the peak caller:
+        # every analysis type shares chrID/coord/Y, the rest depends on the type
+
+        message( "Info: calling peaks..." )
+        switch( analysisType,
+            OS = {
+                binCore <- list( chrID=object@chrID, coord=object@coord, Y=object@tagCount )
+                dataSet <- c( binCore,
+                    list( M=object@mappability, GC=object@gcContent ) )
+            },
+            TS = {
+                binCore <- list( chrID=object@chrID, coord=object@coord, Y=object@tagCount )
+                dataSet <- c( binCore,
+                    list( X=object@input, M=object@mappability, GC=object@gcContent ) )
+            },
+            IO = {
+                binCore <- list( chrID=object@chrID, coord=object@coord, Y=object@tagCount )
+                dataSet <- c( binCore, list( X=object@input ) )
+            }
+        )
+        fitPeak <- .peakCall( postProb=fitPH, dataSet=dataSet,
+            FDR=FDR, binsize=binsize, maxgap=maxgap, minsize=minsize, thres=thres,
+            analysisType=analysisType )
+        peakSet <- fitPeak$peakSet
+
+        rm( fitPH, dataSet )
+        gc()
+
+        message( "Info: done!" )
+
+        # wrap everything up in a "MosaicsPeak" object
+
+        peakParam <- new( "MosaicsPeakParam",
+            analysisType=analysisType, signalModel=signalModel,
+            FDR=FDR, maxgap=maxgap, minsize=minsize, thres=thres )
+
+        # no peak set at all -> store an empty data frame instead of NULL
+
+        peakList <- if ( is.null(peakSet) ) data.frame() else peakSet
+
+        new( "MosaicsPeak",
+            peakList=peakList, peakParam=peakParam, bdBin=fitPeak$bdBin, empFDR=fitPeak$empFDR )
+    }
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/mosaicsRunAll.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,461 @@
+# Run the complete two-sample (ChIP vs. control, analysis type "IO") MOSAiCS
+# workflow in one call:
+#   1. construct bin-level files for the ChIP and control samples (constructBins),
+#   2. read them (readBins), fit the model (mosaicsFit) and call peaks
+#      (mosaicsPeak), either chromosome by chromosome (byChr=TRUE) or genome-wide,
+#   3. export the peak list and, on request, write the summary, goodness-of-fit
+#      and exploratory reports.
+#
+# Arguments:
+#   chipFile, chipFileFormat         ChIP read file and its format (required).
+#   controlFile, controlFileFormat   control read file and its format (required).
+#   binfileDir                       directory for the bin-level files (required).
+#   peakFile, peakFileFormat         output file name(s) and matching format(s),
+#                                    each format one of "txt", "bed", "gff" (required,
+#                                    same length).
+#   reportSummary, summaryFile       write the text summary to 'summaryFile'.
+#   reportExploratory, exploratoryFile  write exploratory plots to this PDF.
+#   reportGOF, gofFile               write goodness-of-fit plots to this PDF.
+#   PET, byChr, useChrfile, chrfile, excludeChr, fragLen, binSize, capping
+#                                    passed on to constructBins(); PET, fragLen and
+#                                    binSize also determine the bin-file names that
+#                                    are looked up afterwards.
+#   bgEst, d                         passed on to mosaicsFit().
+#   signalModel                      "1S", "2S", or "BIC" (pick the model with the
+#                                    smaller BIC for each fit).
+#   FDR, maxgap, minsize, thres      passed on to mosaicsPeak().
+#   parallel, nCore                  use mclapply() with 'nCore' cores where possible.
+#
+# The function is called for its side effects (files written). Note that it
+# changes the working directory to 'binfileDir' and does not restore it; output
+# files given as relative paths are therefore resolved against 'binfileDir'.
+mosaicsRunAll <- function(
+    chipFile=NULL, chipFileFormat=NULL,
+    controlFile=NULL, controlFileFormat=NULL,
+    binfileDir=NULL,
+    peakFile=NULL, peakFileFormat=NULL,
+    reportSummary=FALSE, summaryFile=NULL,
+    reportExploratory=FALSE, exploratoryFile=NULL,
+    reportGOF=FALSE, gofFile=NULL,
+    PET=FALSE, byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    FDR=0.05, fragLen=200, binSize=200, capping=0, bgEst="automatic", d=0.25,
+    signalModel="BIC", maxgap=200, minsize=50, thres=10, parallel=FALSE, nCore=8 ) {
+
+    analysisType <- "IO"
+
+    ###############################################################
+    #                                                             #
+    #                      local helpers                          #
+    #                                                             #
+    ###############################################################
+
+    # mclapply() does not stop when a worker fails: it returns a "try-error"
+    # object in that slot. Turn such a failure into an explicit error.
+
+    checkWorkers <- function( res, what ) {
+        failed <- vapply( res, function(x) inherits( x, "try-error" ), logical(1) )
+        if ( any(failed) ) {
+            stop( "Parallel ", what, " failed: ",
+                as.character( res[[ which(failed)[1] ]] ) )
+        }
+        invisible( res )
+    }
+
+    # construct the bin-level file for one sample; x = c( file, fileFormat )
+
+    buildBins <- function(x) {
+        constructBins(
+            infile = x[1], fileFormat = x[2], outfileLoc = binfileDir,
+            byChr = byChr, useChrfile = useChrfile, chrfile = chrfile, excludeChr = excludeChr,
+            PET = PET, fragLen = fragLen, binSize = binSize, capping = capping )
+    }
+
+    # call peaks for one fit; with signalModel="BIC" the model with the
+    # smaller BIC is used. Returns the peak object and a readable model label.
+
+    callPeaks <- function( fit ) {
+        if ( signalModel=="BIC" ) {
+            chosenModel <- if ( fit@bic1S < fit@bic2S ) "1S" else "2S"
+        } else {
+            chosenModel <- signalModel
+        }
+        peak <- mosaicsPeak( fit, signalModel=chosenModel,
+            FDR=FDR, maxgap=maxgap, minsize=minsize, thres=thres )
+        if ( chosenModel=="1S" ) {
+            modelLabel <- "One-signal-component model"
+        } else {
+            modelLabel <- "Two-signal-component model"
+        }
+        list( peak=peak, opt_sig_model=modelLabel )
+    }
+
+    # chromosome ID of a chromosome-wise bin file = text between the last "_"
+    # and the ".txt" extension (".txt" is matched literally, not as a regex)
+
+    extractChrID <- function( fileNames ) {
+        unlist( lapply( fileNames, function(x) {
+            splitvec <- strsplit( x, "_" )[[1]]
+            IDtxt <- splitvec[ length(splitvec) ]
+            strsplit( IDtxt, ".txt", fixed=TRUE )[[1]][1]
+        } ) )
+    }
+
+    # open a PDF, draw into it, and close it again. If drawing fails the device
+    # is still closed, so no graphics device is leaked. Returns the value of
+    # dev.off() on success.
+
+    drawToPdf <- function( file, draw ) {
+        pdf( file )
+        closed <- FALSE
+        on.exit( if ( !closed ) dev.off() )
+        draw()
+        closed <- TRUE
+        dev.off()
+    }
+
+    # title page carrying the chromosome ID (used in chromosome-wise reports)
+
+    plotTitlePage <- function( chrID ) {
+        plot( 0, 0, type="n", axes=FALSE, ann=FALSE )
+        text( 0, 0, chrID, cex=4 )
+    }
+
+    # exploratory plots for one "bin" object. 'analysisType' is fixed to "IO"
+    # in this function, so only the first branch is taken at present.
+
+    plotExploratory <- function( bin ) {
+        plot( bin )
+        if ( analysisType=="IO" ) {
+            plot( bin, plotType="input" )
+        }
+        if ( analysisType=="OS" ) {
+            plot( bin, plotType="M" )
+            plot( bin, plotType="GC" )
+        }
+        if ( analysisType=="TS" ) {
+            plot( bin, plotType="M" )
+            plot( bin, plotType="GC" )
+            plot( bin, plotType="M|input" )
+            plot( bin, plotType="GC|input" )
+        }
+    }
+
+    ###############################################################
+    #                                                             #
+    #                    argument checking                        #
+    #                                                             #
+    ###############################################################
+
+    # check options: input & output (required)
+
+    if ( is.null(chipFile) ) { stop( "Please specify 'chipFile'!" ) }
+    if ( is.null(chipFileFormat) ) { stop( "Please specify 'chipFileFormat'!" ) }
+
+    if ( is.null(controlFile) ) { stop( "Please specify 'controlFile'!" ) }
+    if ( is.null(controlFileFormat) ) { stop( "Please specify 'controlFileFormat'!" ) }
+
+    if ( is.null(peakFile) ) { stop( "Please specify 'peakFile'!" ) }
+    if ( is.null(peakFileFormat) ) { stop( "Please specify 'peakFileFormat'!" ) }
+
+    if ( is.null(binfileDir) ) { stop( "Please specify 'binfileDir'!" ) }
+
+    # check options: peak list (one format per output file)
+
+    if ( length(peakFile) != length(peakFileFormat) ) {
+        stop( "Lengths of 'peakFile' and 'peakFileFormat' should be same!" )
+    }
+
+    # check options: reports (optional)
+
+    if ( reportSummary && is.null(summaryFile) ) {
+        stop( "Please specify 'summaryFile'!" )
+    }
+
+    if ( reportGOF && is.null(gofFile) ) {
+        stop( "Please specify 'gofFile'!" )
+    }
+
+    if ( reportExploratory && is.null(exploratoryFile) ) {
+        stop( "Please specify 'exploratoryFile'!" )
+    }
+
+    # check options: parallel computing (optional)
+
+    if ( parallel == TRUE ) {
+        message( "Use 'parallel' package for parallel computing." )
+        if ( length(find.package('parallel',quiet=TRUE)) == 0 ) {
+            stop( "Please install 'parallel' package!" )
+        }
+    }
+
+    ###############################################################
+    #                                                             #
+    #             construction of bin-level files                 #
+    #                                                             #
+    ###############################################################
+
+    cat( "Info: constructing bin-level files...\n" )
+
+    processSet <- list(
+        c( chipFile, chipFileFormat ),
+        c( controlFile, controlFileFormat ) )
+
+    if ( parallel == TRUE ) {
+        # if "parallel" package exists, utilize parallel computing with "mclapply"
+
+        checkWorkers( mclapply( processSet, buildBins, mc.cores=nCore ),
+            "construction of bin-level files" )
+    } else {
+        # otherwise, use usual "lapply"
+
+        lapply( processSet, buildBins )
+    }
+
+    # part of the bin-file name shared by chromosome-wise and genome-wide files;
+    # it is appended to the base name of the read file
+
+    if ( PET == TRUE ) {
+        binStem <- paste( "_bin", binSize, sep="" )
+    } else {
+        binStem <- paste( "_fragL", fragLen, "_bin", binSize, sep="" )
+    }
+
+    if ( byChr ) {
+        ###############################################################
+        #                                                             #
+        #                chromosome-wise analysis                     #
+        #                                                             #
+        ###############################################################
+
+        # locate bin-level files (one per chromosome)
+
+        cat( "Info: analyzing bin-level files...\n" )
+
+        # file names returned by list.files() are relative to 'binfileDir',
+        # hence the change of working directory before they are read
+
+        setwd( binfileDir )
+        list_chip <- list.files( path=binfileDir,
+            pattern=paste( basename(chipFile), binStem, "_.*.txt", sep="" ) )
+        list_control <- list.files( path=binfileDir,
+            pattern=paste( basename(controlFile), binStem, "_.*.txt", sep="" ) )
+
+        # check list of chromosomes & analyze only chromosomes
+        # that bin-level files for both chip & control exist
+
+        chrID_chip <- extractChrID( list_chip )
+        chrID_control <- extractChrID( list_control )
+
+        # position of each ChIP chromosome among the control files (NA if absent);
+        # keep only the matched ones and pair each ChIP file with ITS control file
+
+        matched <- match( chrID_chip, chrID_control )
+        index_chip <- which( !is.na( matched ) )
+        index_control <- matched[ index_chip ]
+
+        if ( length(index_chip) == 0 ) {
+            stop( "No chromosome is common between ChIP and control bin-level files!" )
+        }
+
+        index_list <- lapply( seq_along(index_chip), function(i) {
+            c( index_chip[i], index_control[i] )
+        } )
+
+        # model fitting & peak calling
+
+        cat( "Info: fitting MOSAiCS model & call peaks...\n" )
+
+        # never ask for more cores than there are chromosomes to process
+
+        if ( length(index_chip) < nCore ) {
+            nCore <- length(index_chip)
+        }
+
+        # full analysis of one chromosome; x = c( index into list_chip,
+        # index into list_control )
+        # NOTE(review): readBins()/mosaicsFit() are themselves given
+        # parallel/nCore, also when this worker runs inside mclapply();
+        # confirm that nested forking is intended.
+
+        analyzeChr <- function(x) {
+            # read in bin-level file
+
+            chip_file <- list_chip[ x[1] ]
+            input_file <- list_control[ x[2] ]
+            bin <- readBins(
+                type=c("chip","input"), fileName=c(chip_file,input_file),
+                parallel=parallel, nCore=nCore )
+
+            # fit model
+
+            fit <- mosaicsFit( bin, analysisType=analysisType, bgEst=bgEst, d=d,
+                parallel=parallel, nCore=nCore )
+
+            # call peaks
+
+            called <- callPeaks( fit )
+            peak <- called$peak
+
+            # keep results
+
+            peakPrint <- print(peak)
+
+            # with no peaks there is no 'peakSize' column to summarize;
+            # report the median width as missing instead of failing later
+
+            nPeaks <- nrow(peak@peakList)
+            if ( nPeaks > 0 ) {
+                peakWidth <- median(peak@peakList$peakSize)
+            } else {
+                peakWidth <- NA_real_
+            }
+
+            return( list( chrID=as.character( chrID_chip[ x[1] ] ),
+                bin=bin, fit=fit, peak=peak, peakPrint=peakPrint,
+                n_peaks=nPeaks, peak_width=peakWidth,
+                opt_sig_model=called$opt_sig_model ) )
+        }
+
+        if ( parallel == TRUE ) {
+            # if "parallel" package exists, utilize parallel computing with "mclapply"
+
+            out <- mclapply( index_list, analyzeChr, mc.cores=nCore )
+            checkWorkers( out, "chromosome-wise analysis" )
+        } else {
+            # otherwise, use usual "lapply"
+
+            out <- lapply( index_list, analyzeChr )
+        }
+
+        # summarize results
+
+        peakSetFinal <- do.call( rbind, lapply( out, function(res) res$peakPrint ) )
+
+        resultList <- list()
+        resultList$chrID <- resultList$n_peaks <-
+            resultList$peak_width <- resultList$opt_sig_model <- rep( NA, length(out) )
+        for ( i in seq_along(out) ) {
+            resultList$chrID[i] <- out[[i]]$chrID
+            resultList$n_peaks[i] <- out[[i]]$n_peaks
+            resultList$peak_width[i] <- out[[i]]$peak_width
+            resultList$opt_sig_model[i] <- out[[i]]$opt_sig_model
+        }
+    } else {
+        ###############################################################
+        #                                                             #
+        #                   genome-wide analysis                      #
+        #                                                             #
+        ###############################################################
+
+        # locate bin-level files (one per sample)
+
+        cat( "Info: analyzing bin-level files...\n" )
+
+        # file names returned by list.files() are relative to 'binfileDir',
+        # hence the change of working directory before they are read
+
+        setwd( binfileDir )
+        chip_file <- list.files( path=binfileDir,
+            pattern=paste( basename(chipFile), binStem, ".txt", sep="" ) )
+        input_file <- list.files( path=binfileDir,
+            pattern=paste( basename(controlFile), binStem, ".txt", sep="" ) )
+
+        # model fitting & peak calling
+
+        cat( "Info: fitting MOSAiCS model & call peaks...\n" )
+
+        out <- list()
+
+        # read in bin-level file
+
+        out$bin <- readBins( type=c("chip","input"), fileName=c(chip_file,input_file),
+            parallel=parallel, nCore=nCore )
+
+        # fit model
+
+        out$fit <- mosaicsFit( out$bin, analysisType=analysisType, bgEst=bgEst, d=d,
+            parallel=parallel, nCore=nCore )
+
+        # call peaks
+
+        called <- callPeaks( out$fit )
+        out$peak <- called$peak
+        out$opt_sig_model <- called$opt_sig_model
+
+        # keep results (per-chromosome counts & median widths for the summary)
+
+        peakSetFinal <- print(out$peak)
+
+        peakPrint <- split( peakSetFinal, peakSetFinal$chrID )
+        out$chrID <- names(peakPrint)
+        out$n_peaks <- unlist( lapply( peakPrint, nrow ) )
+        out$peak_width <- unlist( lapply( peakPrint, function(x) { median(x$peakSize) } ) )
+
+        resultList <- list()
+        resultList$chrID <- out$chrID
+        resultList$n_peaks <- out$n_peaks
+        resultList$peak_width <- out$peak_width
+        resultList$opt_sig_model <- rep( out$opt_sig_model, length(resultList$chrID) )
+    }
+
+    # write peak calling results (one output file per requested format)
+
+    cat( "Info: writing the peak list...\n" )
+
+    for ( ff in seq_along(peakFileFormat) ) {
+        if ( peakFileFormat[ff] == "txt" ) {
+            .exportTXT( peakList=peakSetFinal, filename=peakFile[ff] )
+        } else if ( peakFileFormat[ff] == "bed" ) {
+            .exportBED( peakList=peakSetFinal, filename=peakFile[ff] )
+        } else if ( peakFileFormat[ff] == "gff" ) {
+            .exportGFF( peakList=peakSetFinal, filename=peakFile[ff] )
+        } else {
+            stop( "Inappropriate peak file format!" )
+        }
+    }
+
+    # report: summary
+
+    cat( "Info: generating reports...\n" )
+
+    if ( reportSummary ) {
+        .reportSummary( summaryFile=summaryFile, resultList=resultList,
+            chipFile=chipFile, chipFileFormat=chipFileFormat,
+            controlFile=controlFile, controlFileFormat=controlFileFormat,
+            binfileDir=binfileDir,
+            peakFile=peakFile, peakFileFormat=peakFileFormat,
+            byChr=byChr, FDR=FDR, fragLen=fragLen, binSize=binSize, capping=capping,
+            analysisType=analysisType, d=d,
+            signalModel=signalModel, maxgap=maxgap, minsize=minsize, thres=thres )
+    }
+
+    # GOF: one title page + GOF plot per chromosome, or a single genome-wide plot
+
+    if ( reportGOF ) {
+        drawToPdf( gofFile, function() {
+            if ( byChr ) {
+                for ( i in seq_along(out) ) {
+                    plotTitlePage( out[[i]]$chrID )
+                    plot( out[[i]]$fit )
+                }
+            } else {
+                plot( out$fit )
+            }
+        } )
+    }
+
+    # exploratory analysis: same layout as the GOF report
+
+    if ( reportExploratory ) {
+        drawToPdf( exploratoryFile, function() {
+            if ( byChr ) {
+                for ( i in seq_along(out) ) {
+                    plotTitlePage( out[[i]]$chrID )
+                    plotExploratory( out[[i]]$bin )
+                }
+            } else {
+                plotExploratory( out$bin )
+            }
+        } )
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/R/readBins.R	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,819 @@
+
+# read bin-level data & process it
+
+readBins <- function( type=c("chip","input"), fileName=NULL,
+    dataType='unique', rounding=100, parallel=FALSE, nCore=8 )
+{
+    # [Note] Assumption: chip, input, M, GC, & N have corresponding coordinates
+
+    # error treatment
+
+    .validType( type )
+    .validLocation( type=type, fileName=fileName )
+    .validDataType( dataType )
+
+    # check options: parallel computing (optional)
+
+    if ( parallel == TRUE ) {
+        message( "Use 'parallel' package for parallel computing." )
+        if ( length(find.package('parallel',quiet=TRUE)) == 0 ) {
+            stop( "Please install 'parallel' package!" )
+        }
+    }
+
+    # existence
+
+    existInput <- ( length(which(type=="input")) > 0 )
+    existM <- ( length(which(type=="M")) > 0 )
+    existGC <- ( length(which(type=="GC")) > 0 )
+    existN <- ( length(which(type=="N")) > 0 )
+
+    # read bin-level data
+
+    message( "Info: reading and preprocessing bin-level data..." )
+
+    chipFileName <- fileName[ type=="chip" ]
+    chip <- read.table( chipFileName, header=FALSE, sep='\t',
+        colClasses=c("character","numeric","numeric"), comment.char="",
+        stringsAsFactors=FALSE, check.names=FALSE )
+
+    if ( existM )
+    {
+        mapScoreFileName <- fileName[ type=="M" ]
+        mapScore <- read.table( mapScoreFileName, header=FALSE,
+            stringsAsFactors=FALSE, check.names=FALSE, comment.char="" )
+    }
+    if ( existGC ) {
+        gcScoreFileName <- fileName[ type=="GC" ]
+        gcScore <- read.table( gcScoreFileName, header=FALSE,
+            stringsAsFactors=FALSE, check.names=FALSE, comment.char="" )
+    }
+    if ( existN ) {
+        nNucFileName <- fileName[ type=="N" ]
+        nNuc <- read.table( nNucFileName, header=FALSE,
+            stringsAsFactors=FALSE, check.names=FALSE, comment.char="" )
+    }
+    if ( existInput )
+    {
+        inputFileName <- fileName[ type=="input" ]
+        input <- read.table( inputFileName, header=FALSE, sep='\t',
+            colClasses=c("character","numeric","numeric"), comment.char="",
+            stringsAsFactors=FALSE, check.names=FALSE )
+    }
+
+    # error treatment
+
+    if ( existInput )
+    {
+        if ( chipFileName == inputFileName ) {
+            warning( "the same data was used for both ChIP & Input." )
+            warning( "parameters cannot be properly estimated in this case." )
+            warning( "remember that you will get errors in 'mosaicsFit' method!" )
+        }
+    }
+
+    # process bin-level data
+
+    chrChIP <- unique(chip[,1])
+    chrCommon <- chrChIP
+
+    if ( existInput )
+    {
+        if ( existM & existGC & existN )
+        {
+            ####################################################################
+            #                                                                  #
+            #     two-sample analysis (if M, GC, N are available)              #
+            #                                                                  #
+            ####################################################################
+
+            if ( length(chrChIP) > 1 ) {
+                # genome-wide analysis
+
+                message( "Info: assume that data contains more than one chromosome" )
+
+                if ( ncol(mapScore) == 3 & ncol(gcScore) == 3 & ncol(nNuc) == 3 ) {
+                    # split data by chromosome
+
+                    chipByChr <- split( chip[,2:3], chip[,1] )
+                    inputByChr <- split( input[,2:3], input[,1] )
+                    mapByChr <- split( mapScore[,2:3], mapScore[,1] )
+                    gcByChr <- split( gcScore[,2:3], gcScore[,1] )
+                    nByChr <- split( nNuc[,2:3], nNuc[,1] )
+
+                    # extract list of shared chromosomes
+
+                    chrInput <- unique(input[,1])
+                    chrM <- unique(mapScore[,1])
+                    chrGC <- unique(gcScore[,1])
+                    chrN <- unique(nNuc[,1])
+
+                    chrCommon <- Reduce( intersect,
+                        list( chrChIP, chrInput, chrM, chrGC, chrN ) )
+                    chrCommon <- sort(chrCommon)
+
+                    # provide error & stop if there is no common chromosome
+
+                    if ( length(chrCommon) == 0 ) {
+                        stop( "No chromosome is common among ChIP, control, mappability, GC, and N files." )
+                    }
+
+                    # construct data to process
+
+                    coordMat <- matrix( NA, length(chrCommon), 4 )
+
+                    dataList <- list()
+                    for ( i in 1:length(chrCommon) ) {
+                        dataList[[i]] <- list()
+                        dataList[[i]]$chip <- chipByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$input <- inputByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$mapScore <- mapByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$gcScore <- gcByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$nNuc <- nByChr[[ chrCommon[i] ]]
+
+                        coordMat[i,1] <- min(chipByChr[[ chrCommon[i] ]][,1])
+                        coordMat[i,2] <- max(chipByChr[[ chrCommon[i] ]][,1])
+                    }
+
+                    rm( chipByChr, inputByChr, mapByChr, gcByChr, nByChr )
+                    rm( chrInput, chrM, chrGC, chrN )
+                    gc()
+
+                    # processing (using parallel computing, if multicore exists)
+
+                    if ( parallel == TRUE ) {
+                        # if "multicore" package exists, utilize parallel computing with "mclapply"
+                        #require(multicore)
+
+                        out <- mclapply( dataList, function(x) {
+                            .processBin_MGCX( chip=x$chip, input=x$input,
+                                mapScore=x$mapScore, gcScore=x$gcScore, nNuc=x$nNuc,
+                                dataType=dataType, rounding=rounding )
+                            }, mc.cores = nCore )
+                    } else {
+                        # otherwise, use usual "lapply"
+
+                        out <- lapply( dataList, function(x) {
+                            .processBin_MGCX( chip=x$chip, input=x$input,
+                                mapScore=x$mapScore, gcScore=x$gcScore, nNuc=x$nNuc,
+                                dataType=dataType, rounding=rounding )
+                            } )
+                    }
+
+                    rm( dataList )
+                    gc()
+
+                    # stack processed data
+
+                    chrID <- coord <- Y <- X <- M <- GC <- c()
+
+                    for ( i in 1:length(chrCommon) ) {
+                        chrID <- c( chrID, rep( chrCommon[i], length(out[[i]]$coord) ) )
+                        coord <- c( coord, out[[i]]$coord )
+                        Y <- c( Y, out[[i]]$Y )
+                        X <- c( X, out[[i]]$X )
+                        M <- c( M, out[[i]]$M )
+                        GC <- c( GC, out[[i]]$GC )
+
+                        coordMat[i,3] <- min(out[[i]]$coord)
+                        coordMat[i,4] <- max(out[[i]]$coord)
+                    }
+
+                    rm( out )
+                    gc()
+
+                    outBin <- new( "BinData", chrID=chrID, coord=coord, tagCount=Y, input=X,
+                        mappability=M, gcContent=GC, dataType=dataType )
+
+                    rm( chrID, coord, Y, X, M, GC )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            } else {
+                message( "Info: assume that data contains only one chromosome" )
+
+                if ( ncol(mapScore) == 2 & ncol(gcScore) == 2 & ncol(nNuc) == 2 ) {
+                    # processing
+
+                    out <- .processBin_MGCX( chip=chip[,2:3], input=input[,2:3],
+                        mapScore=mapScore, gcScore=gcScore, nNuc=nNuc,
+                        dataType=dataType, rounding=rounding )
+
+                    chrID <- rep( chip[1,1], length(out$coord) )
+                    outBin <- new( "BinData", chrID=chrID,
+                        coord=out$coord, tagCount=out$Y, input=out$X,
+                        mappability=out$M, gcContent=out$GC, dataType=dataType )
+
+                    rm( chrID, out )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            }
+        } else if ( existN ) {
+            ####################################################################
+            #                                                                  #
+            #          two-sample analysis (input only, with N)                #
+            #                                                                  #
+            ####################################################################
+
+            if ( length(chrChIP) > 1 ) {
+                # genome-wide analysis
+
+                message( "Info: data contains more than one chromosome" )
+
+                if ( ncol(nNuc) == 3 ) {
+                    # split data by chromosome
+
+                    chipByChr <- split( chip[,2:3], chip[,1] )
+                    inputByChr <- split( input[,2:3], input[,1] )
+                    nByChr <- split( nNuc[,2:3], nNuc[,1] )
+
+                    # extract list of shared chromosomes
+
+                    chrInput <- unique(input[,1])
+                    chrN <- unique(nNuc[,1])
+                    chrCommon <- intersect( chrChIP, chrInput, chrN )
+                    chrCommon <- sort(chrCommon)
+
+                    # provide error & stop if there is no common chromosome
+
+                    if ( length(chrCommon) == 0 ) {
+                        stop( "No chromosome is common among ChIP, control, and N files." )
+                    }
+
+                    # construct data to process
+
+                    coordMat <- matrix( NA, length(chrCommon), 4 )
+
+                    dataList <- list()
+                    for ( i in 1:length(chrCommon) ) {
+                        dataList[[i]] <- list()
+                        dataList[[i]]$chip <- chipByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$input <- inputByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$nNuc <- nByChr[[ chrCommon[i] ]]
+
+                        coordMat[i,1] <- min(chipByChr[[ chrCommon[i] ]][,1])
+                        coordMat[i,2] <- max(chipByChr[[ chrCommon[i] ]][,1])
+                    }
+
+                    rm( chipByChr, inputByChr, nByChr )
+                    rm( chrInput, chrN )
+                    gc()
+
+                    # processing (using parallel computing, if multicore exists)
+
+                    if ( parallel == TRUE ) {
+                        # if "multicore" package exists, utilize parallel computing with "mclapply"
+                        #require(multicore)
+
+                        out <- mclapply( dataList, function(x) {
+                            .processBin_XN( chip=x$chip, input=x$input, nNuc=x$nNuc,
+                                dataType=dataType )
+                            }, mc.cores = nCore )
+                    } else {
+                        # otherwise, use usual "lapply"
+
+                        out <- lapply( dataList, function(x) {
+                            .processBin_XN( chip=x$chip, input=x$input, nNuc=x$nNuc,
+                                dataType=dataType )
+                            } )
+                    }
+
+                    rm( dataList )
+                    gc()
+
+                    # stack processed data
+
+                    chrID <- coord <- Y <- X <- c()
+
+                    for ( i in 1:length(chrCommon) ) {
+                        chrID <- c( chrID, rep( chrCommon[i], length(out[[i]]$coord) ) )
+                        coord <- c( coord, out[[i]]$coord )
+                        Y <- c( Y, out[[i]]$Y )
+                        X <- c( X, out[[i]]$X )
+
+                        coordMat[i,3] <- min(out[[i]]$coord)
+                        coordMat[i,4] <- max(out[[i]]$coord)
+                    }
+
+                    rm( out )
+                    gc()
+
+                    outBin <- new( "BinData", chrID=chrID, coord=coord, tagCount=Y, input=X,
+                        dataType=dataType )
+
+                    rm( chrID, coord, Y, X )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            } else {
+                message( "Info: data contains only one chromosome" )
+
+                if ( ncol(nNuc) == 2 ) {
+                    # processing
+
+                    out <- .processBin_XN( chip=chip[,2:3], input=input[,2:3], nNuc=nNuc,
+                        dataType=dataType )
+
+                    chrID <- rep( chip[1,1], length(out$coord) )
+                    outBin <- new( "BinData", chrID=chrID, coord=out$coord, tagCount=out$Y,
+                        input=out$X, dataType=dataType )
+
+                    rm( chrID, out )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            }
+        } else
+        {
+            ####################################################################
+            #                                                                  #
+            #            two-sample analysis (input only, without N)           #
+            #                                                                  #
+            ####################################################################
+
+            if ( length(chrChIP) > 1 ) {
+                # genome-wide analysis
+
+                message( "Info: data contains more than one chromosome." )
+
+                # split data by chromosome
+
+                chipByChr <- split( chip[,2:3], chip[,1] )
+                inputByChr <- split( input[,2:3], input[,1] )
+
+                # extract list of shared chromosomes
+
+                chrInput <- unique(input[,1])
+                chrCommon <- intersect( chrChIP, chrInput )
+                chrCommon <- sort(chrCommon)
+
+                # provide error & stop if there is no common chromosome
+
+                if ( length(chrCommon) == 0 ) {
+                    stop( "No chromosome is common between ChIP and control files." )
+                }
+
+                # construct data to process
+
+                coordMat <- matrix( NA, length(chrCommon), 4 )
+
+                dataList <- list()
+                for ( i in 1:length(chrCommon) ) {
+                    dataList[[i]] <- list()
+                    dataList[[i]]$chip <- chipByChr[[ chrCommon[i] ]]
+                    dataList[[i]]$input <- inputByChr[[ chrCommon[i] ]]
+
+                    coordMat[i,1] <- min(chipByChr[[ chrCommon[i] ]][,1])
+                    coordMat[i,2] <- max(chipByChr[[ chrCommon[i] ]][,1])
+                }
+
+                rm( chipByChr, inputByChr )
+                rm( chrInput )
+                gc()
+
+                # processing (using parallel computing, if multicore exists)
+
+                if ( parallel == TRUE ) {
+                    # if "multicore" package exists, utilize parallel computing with "mclapply"
+                    #require(multicore)
+
+                    out <- mclapply( dataList, function(x) {
+                        .processBin_X( chip=x$chip, input=x$input, dataType=dataType )
+                        }, mc.cores = nCore )
+                } else {
+                    # otherwise, use usual "lapply"
+
+                    out <- lapply( dataList, function(x) {
+                        .processBin_X( chip=x$chip, input=x$input, dataType=dataType )
+                        } )
+                }
+
+                rm( dataList )
+                gc()
+
+                # stack processed data
+
+                chrID <- coord <- Y <- X <- c()
+
+                for ( i in 1:length(chrCommon) ) {
+                    chrID <- c( chrID, rep( chrCommon[i], length(out[[i]]$coord) ) )
+                    coord <- c( coord, out[[i]]$coord )
+                    Y <- c( Y, out[[i]]$Y )
+                    X <- c( X, out[[i]]$X )
+
+                    coordMat[i,3] <- min(out[[i]]$coord)
+                    coordMat[i,4] <- max(out[[i]]$coord)
+                }
+
+                rm( out )
+                gc()
+
+                outBin <- new( "BinData", chrID=chrID, coord=coord, tagCount=Y, input=X,
+                    dataType=dataType )
+
+                rm( chrID, coord, Y, X )
+                gc()
+            } else {
+                message( "Info: data contains only one chromosome." )
+
+                out <- .processBin_X( chip=chip[,2:3], input=input[,2:3], dataType=dataType )
+
+                chrID <- rep( chip[1,1], length(out$coord) )
+                outBin <- new( "BinData", chrID=chrID, coord=out$coord, tagCount=out$Y,
+                    input=out$X, dataType=dataType )
+
+                rm( chrID, out )
+                gc()
+            }
+        }
+    } else
+    {
+        ####################################################################
+        #                                                                  #
+        #                      one-sample analysis                         #
+        #                                                                  #
+        ####################################################################
+
+        if ( existM & existGC & existN ) {
+
+            if ( length(chrChIP) > 1 ) {
+                # genome-wide analysis
+
+                message( "Info: data contains more than one chromosome." )
+
+                if ( ncol(mapScore) == 3 & ncol(gcScore) == 3 & ncol(nNuc) == 3 ) {
+                    # split data by chromosome
+
+                    chipByChr <- split( chip[,2:3], chip[,1] )
+                    mapByChr <- split( mapScore[,2:3], mapScore[,1] )
+                    gcByChr <- split( gcScore[,2:3], gcScore[,1] )
+                    nByChr <- split( nNuc[,2:3], nNuc[,1] )
+
+                    # extract list of shared chromosomes
+
+                    chrM <- unique(mapScore[,1])
+                    chrGC <- unique(gcScore[,1])
+                    chrN <- unique(nNuc[,1])
+
+                    chrCommon <- Reduce( intersect,
+                        list( chrChIP, chrM, chrGC, chrN ) )
+                    chrCommon <- sort(chrCommon)
+
+                    # provide error & stop if there is no common chromosome
+
+                    if ( length(chrCommon) == 0 ) {
+                        stop( "No chromosome is common among ChIP, mappability, GC, and N files." )
+                    }
+
+                    # construct data to process
+
+                    coordMat <- matrix( NA, length(chrCommon), 4 )
+
+                    dataList <- list()
+                    for ( i in 1:length(chrCommon) ) {
+                        dataList[[i]] <- list()
+                        dataList[[i]]$chip <- chipByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$mapScore <- mapByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$gcScore <- gcByChr[[ chrCommon[i] ]]
+                        dataList[[i]]$nNuc <- nByChr[[ chrCommon[i] ]]
+
+                        coordMat[i,1] <- min(chipByChr[[ chrCommon[i] ]][,1])
+                        coordMat[i,2] <- max(chipByChr[[ chrCommon[i] ]][,1])
+                    }
+
+                    rm( chipByChr, mapByChr, gcByChr, nByChr )
+                    rm( chrM, chrGC, chrN )
+                    gc()
+
+                    # processing (using parallel computing, if multicore exists)
+
+                    if ( parallel == TRUE ) {
+                        # if "multicore" package exists, utilize parallel computing with "mclapply"
+                        #require(multicore)
+
+                        out <- mclapply( dataList, function(x) {
+                            .processBin_MGC( chip=x$chip,
+                                mapScore=x$mapScore, gcScore=x$gcScore, nNuc=x$nNuc,
+                                dataType=dataType, rounding=rounding )
+                            }, mc.cores = nCore )
+                    } else {
+                        # otherwise, use usual "lapply"
+
+                        out <- lapply( dataList, function(x) {
+                            .processBin_MGC( chip=x$chip,
+                                mapScore=x$mapScore, gcScore=x$gcScore, nNuc=x$nNuc,
+                                dataType=dataType, rounding=rounding )
+                            } )
+                    }
+
+                    rm( dataList )
+                    gc()
+
+                    # stack processed data
+
+                    chrID <- coord <- Y <- M <- GC <- c()
+
+                    for ( i in 1:length(chrCommon) ) {
+                        chrID <- c( chrID, rep( chrCommon[i], length(out[[i]]$coord) ) )
+                        coord <- c( coord, out[[i]]$coord )
+                        Y <- c( Y, out[[i]]$Y )
+                        M <- c( M, out[[i]]$M )
+                        GC <- c( GC, out[[i]]$GC )
+
+                        coordMat[i,3] <- min(out[[i]]$coord)
+                        coordMat[i,4] <- max(out[[i]]$coord)
+                    }
+
+                    rm( out )
+                    gc()
+
+                    outBin <- new( "BinData", chrID=chrID, coord=coord, tagCount=Y,
+                        mappability=M, gcContent=GC, dataType=dataType )
+
+                    rm( chrID, coord, Y, M, GC )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            } else {
+                message( "Info: data contains only one chromosome." )
+
+                if ( ncol(mapScore) == 2 & ncol(gcScore) == 2 & ncol(nNuc) == 2 ) {
+                    # processing
+
+                    out <- .processBin_MGC( chip=chip[,2:3],
+                        mapScore=mapScore, gcScore=gcScore, nNuc=nNuc,
+                        dataType=dataType, rounding=rounding )
+
+                    chrID <- rep( chip[1,1], length(out$coord) )
+                    outBin <- new( "BinData", chrID=chrID, coord=out$coord, tagCount=out$Y,
+                        mappability=out$M, gcContent=out$GC, dataType=dataType )
+
+                    rm( chrID, out )
+                    gc()
+                } else {
+                    stop( "Numbers of columns do not match!" )
+                }
+            }
+        } else {
+                stop( "All of mappability, GC content, and sequence ambiguity should be provided for one-sample analysis!" )
+        }
+    }
+
+    message( "Info: done!\n" )
+
+    # info about preprocessing (for single chromosome data)
+
+    if( length(chrChIP) == 1 & existN ) {
+        nRatio <- length(which(nNuc[,2]==1)) / length(nNuc[,2])
+        cat( "------------------------------------------------------------\n" )
+        cat( "Info: preprocessing summary\n" )
+        cat( "------------------------------------------------------------\n" )
+        cat( "- percentage of bins with ambiguous sequences: ",round(nRatio*100),"%\n", sep="" )
+        cat( "  (these bins will be excluded from the analysis)\n" )
+        cat( "- before preprocessing:\n" )
+        cat( "\tfirst coordinates = ",min(chip[,2]),
+            ", last coordinates = ",max(chip[,2]), "\n", sep="" )
+        cat( "- after preprocessing:\n" )
+        cat( "\tfirst coordinates = ",min(outBin@coord),
+            ", last coordinates = ",max(outBin@coord), "\n", sep="" )
+        cat( "------------------------------------------------------------\n" )
+    }
+
+    # info about preprocessing (for multiple chromosome data)
+
+    if( length(chrCommon) > 1 & existN ) {
+        cat( "------------------------------------------------------------\n" )
+        cat( "Info: preprocessing summary\n" )
+        cat( "------------------------------------------------------------\n" )
+        cat( "[Note] Bins with ambiguous sequences will be excluded from the analysis.\n" )
+        cat( "Coordinates before & after preprocessing:\n" )
+        for ( i in 1:nrow(coordMat) ) {
+            cat( chrCommon[i],": ",sep="" )
+        cat( "\t",coordMat[i,1]," - ",coordMat[i,2], "\t->", sep="" )
+        cat( "\t",coordMat[i,3]," - ",coordMat[i,4], "\n", sep="" )
+        }
+        cat( "------------------------------------------------------------\n" )
+    }
+
+    return(outBin)
+}
+
+.validType <- function( type )
+{
+    # Validate the 'type' vector. Nothing useful is returned; stop() is called
+    # with an explanatory message when 'type' is unusable.
+
+    # step 1: every entry must be one of the recognised labels
+    # (a zero-length 'type' is rejected at this step as well)
+
+    knownTypes <- c("chip","M","GC","N","input")
+    allKnown <- length(type) > 0 && all( type %in% knownTypes )
+    if ( !allKnown )
+    {
+        stop( "Invalid 'type'! Choose among 'chip','M','GC','N','input'!" )
+    }
+
+    # step 2: 'type' must cover at least chip+M+GC+N or chip+input
+
+    oneSampleSet <- c("chip","M","GC","N")
+    twoSampleSet <- c("chip","input")
+
+    # TRUE when some label required for one-sample analysis is absent
+    lacksOneSample <- !all( oneSampleSet %in% type )
+    # TRUE when some label required for two-sample analysis (Input only) is absent
+    lacksTwoSample <- !all( twoSampleSet %in% type )
+
+    if ( lacksOneSample && lacksTwoSample )
+    {
+        stop( "Minimum requirements of 'type':\nplease provide at least either c('chip','M','GC','N') or c('chip','input')!" )
+    }
+}
+
+.validLocation <- function( type, fileName )
+{
+    # Validate 'fileName' against 'type': file names must be supplied, and there
+    # must be exactly as many of them as there are entries in 'type'.
+    # stop() is called when either requirement fails.
+
+    if ( is.null(fileName) )
+    {
+        # nothing was supplied at all
+        stop( "Please provide 'fileName' information!" )
+    } else if ( length(fileName) != length(type) )
+    {
+        # one file name is expected per entry of 'type'
+        stop( "Length of 'type' & length of 'fileName' do not match!" )
+    }
+}
+
+.validDataType <- function( dataType )
+{
+    # Validate 'dataType': only 'unique' and 'multi' are accepted;
+    # stop() is called for anything else.
+
+    isKnown <- ( dataType=='unique' | dataType=='multi' )
+    if ( !isKnown )
+    {
+        stop( "Invalid 'dataType'! Choose either 'unique' or 'multi'!" )
+    }
+}
+
+.processBin_MGC <- function( chip, mapScore, gcScore, nNuc, dataType, rounding )
+{
+    # Combine ChIP, mappability, GC, and N tables (column 1 = coordinate,
+    # column 2 = value) for one-sample analysis.
+    # Returns list( coord, Y, M, GC ): kept coordinates, rounded tag counts,
+    # and mappability / GC scores rounded to multiples of 1/rounding.
+
+    # reference coordinates: column 1 of whichever of chip / mapScore / gcScore
+    # has the fewest rows (the first one wins on ties)
+
+    rowCounts <- c( nrow(chip), nrow(mapScore), nrow(gcScore) )
+    refCoord <- list( chip, mapScore, gcScore )[[ which.min(rowCounts) ]][,1]
+    rm( rowCounts )
+
+    # keep only the rows whose coordinate occurs in the reference
+
+    chip <- chip[ chip[,1] %in% refCoord, ]
+    mapScore <- mapScore[ mapScore[,1] %in% refCoord, ]
+    gcScore <- gcScore[ gcScore[,1] %in% refCoord, ]
+    nNuc <- nNuc[ nNuc[,1] %in% refCoord, ]
+
+    # exclude bins with ambiguous sequences (N indicator equal to 1);
+    # the same row positions are removed from every table
+
+    ambiguous <- which( nNuc[,2] == 1 )
+    if ( length(ambiguous) > 0 ) {
+        dropRows <- function( tab ) tab[ -ambiguous, ]
+        chip <- dropRows( chip )
+        mapScore <- dropRows( mapScore )
+        gcScore <- dropRows( gcScore )
+        nNuc <- dropRows( nNuc )
+    }
+
+    # round values
+
+    Y <- round( chip[,2] )    # round tag count in the case of multi match
+    M <- round( mapScore[,2] * rounding ) / rounding
+    GC <- round( gcScore[,2] * rounding / ( 1 - nNuc[,2] ) ) / rounding
+
+    # in case of unique match: zero out tag counts of bins with M equal to 0
+
+    if ( dataType == 'unique' )
+    {
+        unmappable <- which( Y > 0 & M == 0 )
+        if ( length(unmappable) > 0 )
+        {
+            Y[ unmappable ] <- 0
+        }
+    }
+
+    list( coord=chip[,1], Y=Y, M=M, GC=GC )
+}
+
+.processBin_X <- function( chip, input, dataType )
+{
+    # Combine ChIP and input tables (column 1 = coordinate, column 2 = count)
+    # for two-sample analysis without N information.
+    # Returns list( coord, Y, X ): kept ChIP coordinates and the rounded ChIP
+    # and input counts. 'dataType' is accepted but not used here.
+
+    # reference coordinates: column 1 of whichever table has fewer rows
+    # (chip wins on ties)
+
+    rowCounts <- c( nrow(chip), nrow(input) )
+    refCoord <- list( chip, input )[[ which.min(rowCounts) ]][,1]
+    rm( rowCounts )
+
+    # keep only the rows whose coordinate occurs in the reference
+
+    chip <- chip[ chip[,1] %in% refCoord, ]
+    input <- input[ input[,1] %in% refCoord, ]
+
+    # round values (tag counts may be fractional in the case of multi match)
+
+    list( coord=chip[,1], Y=round( chip[,2] ), X=round( input[,2] ) )
+}
+
+.processBin_XN <- function( chip, input, nNuc, dataType )
+{
+    # Combine ChIP, input, and N tables (column 1 = coordinate,
+    # column 2 = value) for two-sample analysis with N information.
+    # Returns list( coord, Y, X ): kept ChIP coordinates and the rounded ChIP
+    # and input counts. 'dataType' is accepted but not used here.
+
+    # reference coordinates: column 1 of whichever of chip / input has fewer
+    # rows (chip wins on ties); nNuc does not take part in this choice
+
+    rowCounts <- c( nrow(chip), nrow(input) )
+    refCoord <- list( chip, input )[[ which.min(rowCounts) ]][,1]
+    rm( rowCounts )
+
+    # keep only the rows whose coordinate occurs in the reference
+
+    chip <- chip[ chip[,1] %in% refCoord, ]
+    input <- input[ input[,1] %in% refCoord, ]
+    nNuc <- nNuc[ nNuc[,1] %in% refCoord, ]
+
+    # exclude bins with ambiguous sequences (N indicator equal to 1);
+    # the same row positions are removed from every table
+
+    ambiguous <- which( nNuc[,2] == 1 )
+    if ( length(ambiguous) > 0 ) {
+        dropRows <- function( tab ) tab[ -ambiguous, ]
+        chip <- dropRows( chip )
+        input <- dropRows( input )
+        nNuc <- dropRows( nNuc )
+    }
+
+    # round values (tag counts may be fractional in the case of multi match)
+
+    list( coord=chip[,1], Y=round( chip[,2] ), X=round( input[,2] ) )
+}
+
+.processBin_MGCX <- function( chip, input, mapScore, gcScore, nNuc, dataType, rounding )
+{
+    # Combine ChIP, input, mappability, GC, and N tables (column 1 = coordinate,
+    # column 2 = value).
+    # Returns list( coord, Y, X, M, GC ): kept coordinates, rounded ChIP and
+    # input counts, and mappability / GC scores rounded to multiples of
+    # 1/rounding.
+
+    # reference coordinates: column 1 of whichever of chip / input / mapScore /
+    # gcScore has the fewest rows (the first one wins on ties)
+
+    rowCounts <- c( nrow(chip), nrow(input), nrow(mapScore), nrow(gcScore) )
+    refCoord <- list( chip, input, mapScore, gcScore )[[ which.min(rowCounts) ]][,1]
+    rm( rowCounts )
+
+    # keep only the rows whose coordinate occurs in the reference
+
+    chip <- chip[ chip[,1] %in% refCoord, ]
+    input <- input[ input[,1] %in% refCoord, ]
+    mapScore <- mapScore[ mapScore[,1] %in% refCoord, ]
+    gcScore <- gcScore[ gcScore[,1] %in% refCoord, ]
+    nNuc <- nNuc[ nNuc[,1] %in% refCoord, ]
+
+    # exclude bins with ambiguous sequences (N indicator equal to 1);
+    # the same row positions are removed from every table
+
+    ambiguous <- which( nNuc[,2] == 1 )
+    if ( length(ambiguous) > 0 ) {
+        dropRows <- function( tab ) tab[ -ambiguous, ]
+        chip <- dropRows( chip )
+        input <- dropRows( input )
+        mapScore <- dropRows( mapScore )
+        gcScore <- dropRows( gcScore )
+        nNuc <- dropRows( nNuc )
+    }
+
+    # round values
+
+    Y <- round( chip[,2] )    # round tag count in the case of multi match
+    X <- round( input[,2] )
+    M <- round( mapScore[,2] * rounding ) / rounding
+    GC <- round( gcScore[,2] * rounding / ( 1 - nNuc[,2] ) ) / rounding
+
+    # in case of unique match: zero out tag counts of bins with M equal to 0
+
+    if ( dataType == 'unique' )
+    {
+        unmappable <- which( Y > 0 & M == 0 )
+        if ( length(unmappable) > 0 )
+        {
+            Y[ unmappable ] <- 0
+        }
+    }
+
+    list( coord=chip[,1], Y=Y, X=X, M=M, GC=GC )
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/convert_csem_format.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,119 @@
+# convert output from stand-alone csem to CSEM BED format
+#
+# Reads the csem result file line by line, translates the chromosome index of
+# each line into a chromosome name taken from the reference file, and writes
+# one BED or GFF line per kept read.
+#
+# Command arguments:
+# - csem_file: csem result file (its first line is skipped)
+# - outfile: output file name
+# - outfile_format: "bed" or "gff"
+# - ref_file: reference file; its third line holds the chromosome names
+# - read_length: read length, used to compute the end coordinate
+# - pseudo_tags?: "Y" keeps only reads with prob > 0.5 and sets their prob to 1
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+
+# parse command arguments
+
+if ( scalar(@ARGV) < 6 ) {
+	print "\nUsage:\n";
+	print "perl convert_csem_format.pl [csem_file] [outfile] [outfile_format] [ref_file]\n";
+	print "\t[read_length] [pseudo_tags?]\n";
+	print "\nExample:\n";
+	print "perl convert_csem_format.pl example.csem example_final.csem bed hg19.ref 36 N\n\n";
+	exit;
+}
+
+my ( $csem_file, $outfile_name, $outfile_format, $ref_file, $read_length, $pseudo_tags ) = @ARGV;
+
+# check the output format once, before any file is opened or created
+# (previously this was only noticed while writing the first kept line)
+
+if ( $outfile_format ne "bed" && $outfile_format ne "gff" ) {
+	print "Inappropriate output file format!\n";
+	exit 1;
+}
+
+# chromosome info
+
+open( my $ref_fh, "<", $ref_file ) or die "Cannot open $ref_file\n";
+
+for ( 1 .. 2 ) {
+	# skip the first line and the second line (sizes; not needed here)
+	my $skipped = <$ref_fh>;
+}
+
+my $chr_vec = <$ref_fh>;
+close( $ref_fh );
+
+defined $chr_vec or die "Cannot read chromosome names from $ref_file\n";
+chomp($chr_vec);
+my @chr_list = split( /\s+/, $chr_vec );
+
+# post-process chromosome & position
+
+open( my $csem_fh, "<", $csem_file ) or die "Cannot open csem file!\n";
+open( my $out_fh, ">", $outfile_name ) or die "Cannot open output file!\n";
+
+my $first_line = <$csem_fh>;	# skip the first line (number of uni-reads and multi-reads)
+
+while ( my $line = <$csem_fh> ) {
+	chomp($line);
+	my @element = split( /\s/, $line );
+		# assume columns are separated by some white space
+
+	# check for invalid line: may cause error in exporting step
+
+	next if scalar(@element) < 5;
+
+	# post-process lines
+
+	my ( $id, $chr, $str, $loc, $prob ) = @element;
+	if ( $outfile_format ne "bed" ) {
+		# first base is 0 in bowtie or BED
+		# first base is 1 in table or GFF
+		$loc++;
+	}
+	my $chrname = $chr_list[$chr];	# translate chromosome
+
+	# generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
+
+	if ( $pseudo_tags eq "Y" ) {
+		# if we want to generate pseudo-tags,
+		# then threshold prob at 0.5 & round prob to integer (i.e., set to one)
+		# (exclude prob = 0.5 as well in order to avoid a read appears more than once)
+
+		next if $prob <= 0.5;
+		$prob = 1;
+	}
+
+	# write down results (score = prob * 1000 in both formats)
+
+	my $start = $loc;
+	my $end = $start + $read_length - 1;
+	my $score = $prob * 1000;
+
+	if ( $outfile_format eq "bed" ) {
+		# BED
+		# - name: read ID
+
+		print {$out_fh} "$chrname\t$start\t$end\t$id\t$score\t$str\n";
+	} else {
+		# GFF (the only other format accepted above)
+		# - source: "CSEM"
+		# - feature: read ID
+
+		print {$out_fh} "$chrname\tCSEM\t$id\t$start\t$end\t$score\t$str\t.\t.\n";
+	}
+}
+
+close( $csem_fh );
+
+# buffered write errors only surface at close, so check it
+
+close( $out_fh ) or die "Cannot close output file!\n";
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/csem_wrapper.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,192 @@
+# Wrapper for CSEM with Bowtie
+# Written by Dongjun Chung, Sep. 8, 2011
+#
+# Steps:
+# 1. build a temporary reference file (number of sequences, lengths, names)
+#    from the output of "bowtie-inspect -s"
+# 2. scan the read file once for the read length and the read IDs
+# 3. run "bowtie ... | csem ..." into a temporary result file
+# 4. rewrite that result as a table, BED, or GFF file, replacing the read index
+#    by the read ID and the chromosome index by the chromosome name
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+use File::Temp qw/tempfile/;
+use File::Temp qw/tmpnam/;
+
+# parse command arguments
+
+die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
+
+my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
+
+# reject unsupported formats before any external program is started
+# (previously a bad output format was only noticed after bowtie & csem had run)
+
+if ( $infile_format ne "fasta" && $infile_format ne "fastq" ) {
+	print "Inappropriate aligned read file format!\n";
+	exit 1;
+}
+if ( $outfile_format ne "table" && $outfile_format ne "bed" && $outfile_format ne "gff" ) {
+	print "Inappropriate output file format!\n";
+	exit 1;
+}
+
+# construct ref genome file (adapted from "genRef.pl")
+
+open( my $inspect_fh, "-|", "bowtie-inspect -s $ref_genome" ) or die "Cannot run bowtie-inspect!\n";
+
+for ( 1 .. 3 ) {
+	# skip unnecessary lines
+	my $skipped = <$inspect_fh>;
+}
+
+my @names = ();	# chromosome name
+my @lens = ();	# chromosome size
+
+while ( my $line = <$inspect_fh> ) {
+	chomp($line);
+	my ( undef, $name, $len ) = split( /[ \t]+/, $line );
+	push( @names, $name );
+	push( @lens, $len );
+}
+
+# a failing command behind a pipe is only reported when the pipe is closed
+
+close( $inspect_fh ) or die "Cannot run bowtie-inspect!\n";
+
+my $size = scalar(@names);	# number of chromosomes
+
+# UNLINK => 1: the temporary reference file is removed when this script exits
+
+my ( $ref_fh, $temp_reffile ) = tempfile( UNLINK => 1 );
+print {$ref_fh} "$size\n";
+print {$ref_fh} "@lens\n";
+print {$ref_fh} "@names\n";
+close( $ref_fh ) or die "Cannot write temporary reference file!\n";
+
+# extract read length & read ID from FASTA/FASTQ files (single pass)
+
+open( my $read_fh, "<", $infile_name ) or die "Cannot open tag file!\n";
+
+my @readID = ();
+my $read_length;
+my $n_line = 0;
+
+while ( my $line = <$read_fh> ) {
+	chomp($line);
+
+	if ( $infile_format eq "fasta" ) {
+		if ( $line =~ /^>(\S+)/ ) {
+			push @readID, $1;
+		} elsif ( !defined $read_length && $line !~ /^>/ ) {
+			# first line that is not a header
+			$read_length = length $line;
+		}
+	} else {
+		# FASTQ: quality lines may also begin with '@', so headers are
+		# identified by their position in the 4-line record, not by the
+		# leading '@' alone (otherwise later read IDs would be shifted)
+
+		my $pos_in_record = $n_line % 4;
+		if ( $pos_in_record == 0 ) {
+			push @readID, $1 if $line =~ /^@(\S+)/;
+		} elsif ( $pos_in_record == 1 && !defined $read_length ) {
+			$read_length = length $line;
+		}
+	}
+
+	$n_line++;
+}
+
+close( $read_fh );
+
+defined $read_length or die "Cannot extract read length from tag file!\n";
+
+# run bowtie & csem
+# NOTE(review): the arguments are interpolated unquoted into a shell command
+# line; file names containing spaces or shell metacharacters will break it.
+
+my $outfile_temp = tmpnam();
+my $format_flag = ( $infile_format eq "fasta" ) ? "-f" : "-q";
+
+system( "bowtie $format_flag -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0
+	or die "Error occurs while running either bowtie or csem!";
+
+# post-process chromosome & position
+
+open( my $csem_fh, "<", $outfile_temp ) or die "Cannot open csem file!\n";
+open( my $out_fh, ">", $outfile_name ) or die "Cannot open output file!\n";
+
+while ( my $line = <$csem_fh> ) {
+	chomp($line);
+	my @element = split( /\s/, $line );
+		# assume columns are separated by some white space
+
+	# check for invalid line: may cause error in exporting step
+
+	next if scalar(@element) < 5;
+
+	# post-process lines
+
+	my ( $id, $chr, $str, $loc, $prob ) = @element;
+	if ( $outfile_format ne "bed" ) {
+		# first base is 0 in bowtie or BED
+		# first base is 1 in table or GFF
+		$loc++;
+	}
+	my $chrname = $names[$chr];	# translate chromosome
+
+	# generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
+
+	if ( $pseudo_tags eq "Y" ) {
+		# if we want to generate pseudo-tags,
+		# then threshold prob at 0.5 & round prob to integer (i.e., set to one)
+		# (exclude prob = 0.5 as well in order to avoid a read appears more than once)
+
+		next if $prob <= 0.5;
+		$prob = 1;
+	}
+
+	# write down results
+
+	my $id_final = $readID[$id];	# translate read index into read ID
+
+	if ( $outfile_format eq "table" ) {
+		print {$out_fh} "$id_final\t$chrname $str $loc $prob\n";
+		next;
+	}
+
+	# BED and GFF share coordinates and score (score = prob * 1000)
+
+	my $start = $loc;
+	my $end = $start + $read_length - 1;
+	my $score = $prob * 1000;
+
+	if ( $outfile_format eq "bed" ) {
+		# BED
+		# - name: read ID
+
+		print {$out_fh} "$chrname\t$start\t$end\t$id_final\t$score\t$str\n";
+	} else {
+		# GFF (the only remaining format accepted above)
+		# - source: "CSEM"
+		# - feature: read ID
+
+		print {$out_fh} "$chrname\tCSEM\t$id_final\t$start\t$end\t$score\t$str\t.\t.\n";
+	}
+}
+
+close( $csem_fh );
+
+# buffered write errors only surface at close, so check it
+
+close( $out_fh ) or die "Cannot close output file!\n";
+
+# the intermediate csem result is no longer needed
+
+unlink($outfile_temp);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/process_readfiles_PET.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,367 @@
+###################################################################
+#
+#   Process read files into bin-level files (PET)
+#
+#   Command arguments:
+#   - infile: Input file (directory + file name)
+#   - outdir: Directory of output files
+#   - format: File format
+#   - binsize: Bin size
+#   - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
+#   - bychr: Construct bin-level files by chromosome? (Y or N)
+#	- chrinfo: Is the file for chromosome info provided?
+#	- chrfile: File name for chromosome info (chr size)
+#	- @excludeChr: Chromosomes to be excluded (vector)
+#
+#   Supported file format:
+#   - eland_result, sam
+#
+#   Note
+#   - chromosomes are extracted from read files, after excluding invalid lines
+#   - uni-reads are assumed
+#
+###################################################################
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+use Cwd;
+use File::Basename;
+
+my ($infile, $outdir, $format, $binsize, $collapse, $bychr,
+	$chrinfo, $chrfile, @exclude_chr) = @ARGV;
+
+# extract only filename from $infile
+
+my @pr = fileparse( $infile );
+my $filename = $pr[0];
+
+# remember current working directory
+
+my $pwd = cwd();
+
+# construct bin-level data based on chromosome info, if provided:
+# every chromosome listed in the file gets a zero count for each of its bins
+
+my %bin_count = ();
+my $bin_start = 0;
+my $bin_stop = 0;
+
+if ( $chrinfo eq "Y" ) {
+	# three-argument open: the file name is never interpreted as a mode
+	open( my $chr_fh, "<", $chrfile ) or die "Cannot open $chrfile\n";
+
+	while ( my $chr_line = <$chr_fh> ) {
+		chomp $chr_line;
+
+		# split ' ' ignores leading white space; blank lines and lines whose
+		# second column does not start with a digit (e.g. a header line)
+		# are skipped instead of creating a bogus chromosome
+		my ( $chrname, $chrsize ) = split ' ', $chr_line;
+		next unless ( defined $chrsize && $chrsize =~ /^\d/ );
+
+		$bin_start = 0;
+		$bin_stop = int($chrsize/$binsize);
+		$bin_count{$chrname} = [ (0) x ( $bin_stop + 1 ) ];
+	}
+
+	close( $chr_fh );
+}
+
+# chromosomes to be excluded (only the hash keys are ever looked up)
+
+my $ecyn = ( scalar(@exclude_chr) == 0 ) ? "N" : "Y";
+my %ec_hash;
+@ec_hash{ @exclude_chr } = ();
+
+# process PET read file
+#
+# Mates are paired by read ID (the part of the read name before "/").
+# The first mate seen is stored in %seen_pos / %seen_str; when the other
+# mate shows up, the fragment spanned by the pair is added to the bin counts.
+
+open( my $in_fh, "<", $infile ) or die "Cannot open input file $infile\n";
+
+my %seen =();			# number of fragments per (chromosome, start, end)
+
+my %seen_pos =();		# position (leftmost position, starting from 1)
+my %seen_str = ();		# strand ('F' or 'R')
+
+my $start;
+my $end;
+
+while ( my $line = <$in_fh> ) {
+	chomp $line;
+
+	# process read file, based on "format" option
+
+	my @parsed;
+
+	if ( $format eq "eland_result" ) {
+		@parsed = eland_result( $line );
+	} elsif ( $format eq "sam" ) {
+		@parsed = sam( $line );
+	} else {
+		# Unsupported file format -> exit and return 1 to environment
+
+		exit 1;
+	}
+
+	# skip if invalid line
+
+	next if ( $parsed[0] == 0 );
+
+	# otherwise, process it
+
+	my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed;
+	my ($id_only) = split /\//, $id;
+
+	# skip if chrt \in @exclude_chr
+
+	next if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } );
+
+	# first mate of a pair: record it and wait for the other one
+
+	unless ( exists $seen_pos{$id_only} ) {
+		$seen_pos{$id_only} = $pos;
+		$seen_str{$id_only} = $str;
+		next;
+	}
+
+	# both mates of a pair should be on different strands;
+	# otherwise keep the recorded mate and ignore this one
+
+	if ( $seen_str{$id_only} eq $str ) {
+		print "inappropriate pair for id $id.\n";
+		next;
+	}
+
+	# the pair is complete: remove the recorded mate right away, so that
+	# every "next" below releases its memory as well
+
+	my $pos1 = delete $seen_pos{$id_only};
+	my $str1 = delete $seen_str{$id_only};
+	my $pos2 = $pos;
+
+	# fragment = position of the forward mate .. last base of the reverse mate
+	# [Note] eland_result: pos = leftmost position, starting from 1.
+
+	if ( $str eq "R" ) {
+		$start = $pos1;
+		$end = $pos2 + $read_length - 1;
+	} elsif ( $str1 eq "R" ) {
+		$start = $pos2;
+		$end = $pos1 + $read_length - 1;
+	} else {
+		# no mate is on the reverse strand: no fragment can be built.
+		# Skip the pair; falling through would count the coordinates
+		# of the previously processed pair once more.
+
+		print "check id $id!\n";
+		next;
+	}
+
+	# process to bin-level files if collapse condition is satisfied.
+	# The key uses a separator so that e.g. (chr1, 12, ...) and
+	# (chr11, 2, ...) cannot be mistaken for the same fragment.
+
+	if ( $collapse > 0 ) {
+		my $id_collapse = join( "\t", $chrt, $start, $end );
+		next if ( ++$seen{$id_collapse} > $collapse );
+	}
+
+	# update bin count
+
+	unless ( exists $bin_count{$chrt} ) {
+		# no matrix for this chromosome yet: if chr info is provided,
+		# do not construct a new one; otherwise construct one
+
+		next if ( $chrinfo eq "Y" );
+		@{$bin_count{$chrt}} = ();
+	}
+
+	$bin_start = int($start/$binsize);
+	$bin_stop = int($end/$binsize);
+	for my $i ( $bin_start .. $bin_stop ) {
+		$bin_count{$chrt}[$i] += $prob;
+	}
+}
+
+close( $in_fh );
+
+# move to output directory; stop here if that fails, because the files
+# would otherwise be written to the current directory without any notice
+
+chdir($outdir) or die "Cannot change directory to $outdir\n";
+
+# write bin-level files: one line "chromosome <TAB> bin start <TAB> count"
+# per bin; chromosomes are written in sorted order for reproducible output
+
+my @chr_ids = sort keys %bin_count;
+
+# list of [ output file name, chromosomes written to that file ]
+
+my @outputs;
+
+if ( $bychr eq "N" ) {
+    # genome-wide version: all chromosome in one file
+
+    push @outputs, [ $filename."_bin".$binsize.".txt", \@chr_ids ];
+} else {
+    # chromosome version: one chromosome in each file
+
+    foreach my $chr_id (@chr_ids) {
+        push @outputs, [ $filename."_bin".$binsize."_".$chr_id.".txt", [ $chr_id ] ];
+    }
+}
+
+foreach my $output (@outputs) {
+    my ( $outfile, $chrs ) = @{$output};
+
+    open( my $out_fh, ">", $outfile ) or die "Cannot open $outfile\n";
+
+    foreach my $chr_id ( @{$chrs} ) {
+        my $bins = $bin_count{$chr_id};
+
+        for my $i ( 0 .. $#{$bins} ) {
+            my $coord = $i*$binsize;
+            my $count = $bins->[$i] || 0;    # untouched bins are undefined
+            print {$out_fh} "$chr_id\t$coord\t$count\n";
+        }
+    }
+
+    # buffered write errors only show up when the file is closed
+
+    close( $out_fh ) or die "Cannot write $outfile\n";
+}
+
+
+################################################################################
+#                                                                              #
+#                              subroutines                                     #
+#                                                                              #
+################################################################################
+
+# -------------------------- subroutine: eland result --------------------------
+
+# Parse one line of an ELAND result file (PET version).
+#
+# Argument:
+#   - line: one tab-separated record, without the trailing newline
+#
+# Returns ( 0 ) if the line has to be skipped (no match, QC failure,
+# multi-read, or truncated record); otherwise
+# ( 1, read ID, chromosome, position, strand, read length, prob ) with prob = 1.
+
+sub eland_result {
+    my $line = shift;
+
+    # parse line
+
+    my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
+
+    # truncated or blank line: strand is the last column that is needed.
+    # Without this check such a line was reported as a valid read
+    # with an undefined ID, chromosome and position.
+
+    return ( 0 ) unless ( defined $str );
+
+    # exclude invalid lines (QC, NM) and multi-reads (R0, R1, R2);
+    # the code is looked for in the match column and the chromosome column
+
+    foreach my $code ( $map, $chrt ) {
+        return ( 0 ) if ( $code =~ /\A(?:QC|NM|R[012])\z/ );
+    }
+
+    # exclude multi-reads given as a comma-separated list of positions:
+    # the position of a uniquely aligned read is a plain integer
+
+    return ( 0 ) unless ( $pos =~ /\A\d+\z/ );
+
+    # return
+
+    my $read_length = length $seq;
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
+}
+
+# -------------------------- subroutine: SAM -----------------------------------
+
+# Parse one line of a SAM file (PET version).
+#
+# Argument:
+#   - line: one record, without the trailing newline
+#
+# Returns ( 0 ) if the line has to be skipped (header line, truncated
+# record, or FLAG bit 4, 512 or 1024 set); otherwise
+# ( 1, read ID, chromosome, position, strand, read length, prob ) with prob = 1.
+
+sub sam {
+    my $line = shift;
+
+    # exclude lines starting with "@" (comments) and blank lines
+
+    return ( 0 ) unless ( $line =~ /^[^@].+/ );
+
+    # parse line
+
+    my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
+
+    # truncated record, or FLAG / POS is not a number: without this check
+    # such a line was reported as a valid forward-strand read
+
+    return ( 0 ) unless ( defined $seq );
+    return ( 0 ) unless ( $bwflag =~ /\A\d+\z/ && $pos =~ /\A\d+\z/ );
+
+    # exclude invalid lines
+    # (bits 4, 512, 1024: unmapped, QC failure, duplicate in the SAM specification)
+
+    return ( 0 ) if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 );
+
+    # strand: bit 16 marks the reverse strand
+
+    my $str = ( $bwflag & 16 ) ? "R" : "F";
+
+    # exclude multi-reads?
+
+    # return
+
+    $pos = int($pos);
+    my $read_length = length $seq;
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/process_readfiles_SET.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,566 @@
+###################################################################
+#
+#   Process read files into bin-level files
+#
+#   Command arguments:
+#   - infile: Input file (directory + file name)
+#   - outdir: Directory of output files
+#   - format: File format
+#   - L: Expected fragment length
+#   - binsize: Bin size
+#   - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
+#   - bychr: Construct bin-level files by chromosome? (Y or N)
+#   - chrinfo: Is the file for chromosome info provided?
+#   - chrfile: File name for chromosome info (chr size)
+#   - @excludeChr: Chromosomes to be excluded (vector)
+#
+#   Supported file format:
+#   - eland_result, eland_extended, eland_export, bowtie, sam, bed, csem
+#
+#   Note
+#   - chromosomes are extracted from read files, after excluding invalid lines
+#   - uni-reads are assumed
+#
+###################################################################
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+use Cwd;
+use File::Basename;
+
+my ($infile, $outdir, $format, $L, $binsize, $collapse, $bychr,
+	$chrinfo, $chrfile, @exclude_chr) = @ARGV;
+
+# extract only filename from $infile
+
+my @pr = fileparse( $infile );
+my $filename = $pr[0];
+
+# remember current working directory
+
+my $pwd = cwd();
+
+# construct bin-level data based on chromosome info, if provided:
+# every chromosome listed in the file gets a zero count for each of its bins
+
+my %bin_count = ();
+my $bin_start = 0;
+my $bin_stop = 0;
+
+if ( $chrinfo eq "Y" ) {
+	# three-argument open: the file name is never interpreted as a mode
+	open( my $chr_fh, "<", $chrfile ) or die "Cannot open $chrfile\n";
+
+	while ( my $chr_line = <$chr_fh> ) {
+		chomp $chr_line;
+
+		# split ' ' ignores leading white space; blank lines and lines whose
+		# second column does not start with a digit (e.g. a header line)
+		# are skipped instead of creating a bogus chromosome
+		my ( $chrname, $chrsize ) = split ' ', $chr_line;
+		next unless ( defined $chrsize && $chrsize =~ /^\d/ );
+
+		# a fragment starting at the last base reaches L - 1 bases further
+		$bin_start = 0;
+		$bin_stop = int( ($chrsize + $L - 1 ) / $binsize );
+		$bin_count{$chrname} = [ (0) x ( $bin_stop + 1 ) ];
+	}
+
+	close( $chr_fh );
+}
+
+# chromosomes to be excluded (only the hash keys are ever looked up)
+
+my $ecyn = ( scalar(@exclude_chr) == 0 ) ? "N" : "Y";
+my %ec_hash;
+@ec_hash{ @exclude_chr } = ();
+
+# load read file and process it
+# (chromosome information is extracted from read file)
+#
+# Each valid read is extended to a fragment of the expected length and
+# its weight (prob) is added to every bin the fragment overlaps.
+
+open( my $in_fh, "<", $infile ) or die "Cannot open $infile\n";
+
+my %seen =();    # number of reads per (chromosome, position, strand)
+
+# parser for each supported "format" option
+
+my %parser_for = (
+    "eland_result"   => \&eland_result,
+    "eland_extended" => \&eland_extended,
+    "eland_export"   => \&eland_export,
+    "bowtie"         => \&bowtie,
+    "sam"            => \&sam,
+    "bed"            => \&bed,
+    "csem"           => \&csem,
+);
+
+while ( my $line = <$in_fh> ) {
+    chomp $line;
+
+    # process read file, based on "format" option
+
+    my $parser = $parser_for{$format};
+
+    # Unsupported file format -> exit and return 1 to R environment
+
+    exit 1 unless ( $parser );
+
+    my @parsed = $parser->( $line, $L );
+
+    # skip if invalid line
+
+    next if ( $parsed[0] == 0 );
+
+    # otherwise, process it
+
+    my ($status, $chrt, $pos, $str, $L_tmp, $read_length, $prob ) = @parsed;
+
+    # strand may be undefined (e.g. BED line without strand column);
+    # such a read is handled like a forward-strand read
+
+    my $is_reverse = ( defined $str && $str eq "R" );
+
+    # skip if chrt \in @exclude_chr
+
+    next if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } );
+
+    # process to bin-level files if collapse condition is satisfied.
+    # The key uses a separator so that e.g. (chr1, 12) and (chr11, 2)
+    # cannot be mistaken for the same position; nothing is recorded
+    # when collapsing is switched off (collapse = 0).
+
+    if ( $collapse > 0 ) {
+        my $id_collapse = join( "\t", $chrt, $pos, ( defined $str ? $str : "" ) );
+        next if ( ++$seen{$id_collapse} > $collapse );
+    }
+
+    # adjust position if it is reverse strand:
+    # the fragment ends at the last base of the read
+
+    $pos = $pos + $read_length - $L if ( $is_reverse );
+    $pos = 1 if ( $pos <= 0 );
+
+    # update bin count
+
+    unless ( exists $bin_count{$chrt} ) {
+        # no matrix for this chromosome yet: if chr info is provided,
+        # do not construct a new one; otherwise construct one
+
+        next if ( $chrinfo eq "Y" );
+        @{$bin_count{$chrt}} = ();
+    }
+
+    $bin_start = int($pos/$binsize);
+    $bin_stop = int(($pos + $L_tmp - 1 )/$binsize);
+    for my $i ( $bin_start .. $bin_stop ) {
+        $bin_count{$chrt}[$i] += $prob;
+    }
+}
+
+close( $in_fh );
+
+# move to output directory; stop here if that fails, because the files
+# would otherwise be written to the current directory without any notice
+
+chdir($outdir) or die "Cannot change directory to $outdir\n";
+
+# write bin-level files: one line "chromosome <TAB> bin start <TAB> count"
+# per bin; chromosomes are written in sorted order for reproducible output
+
+my @chr_ids = sort keys %bin_count;
+
+# list of [ output file name, chromosomes written to that file ]
+
+my @outputs;
+
+if ( $bychr eq "N" ) {
+    # genome-wide version: all chromosome in one file
+
+    push @outputs, [ $filename."_fragL".$L."_bin".$binsize.".txt", \@chr_ids ];
+} else {
+    # chromosome version: one chromosome in each file
+
+    foreach my $chr_id (@chr_ids) {
+        push @outputs, [ $filename."_fragL".$L."_bin".$binsize."_".$chr_id.".txt", [ $chr_id ] ];
+    }
+}
+
+foreach my $output (@outputs) {
+    my ( $outfile, $chrs ) = @{$output};
+
+    open( my $out_fh, ">", $outfile ) or die "Cannot open $outfile\n";
+
+    foreach my $chr_id ( @{$chrs} ) {
+        my $bins = $bin_count{$chr_id};
+
+        for my $i ( 0 .. $#{$bins} ) {
+            my $coord = $i*$binsize;
+            my $count = $bins->[$i] || 0;    # untouched bins are undefined
+            print {$out_fh} "$chr_id\t$coord\t$count\n";
+        }
+    }
+
+    # buffered write errors only show up when the file is closed
+
+    close( $out_fh ) or die "Cannot write $outfile\n";
+}
+
+
+################################################################################
+#                                                                              #
+#                              subroutines                                     #
+#                                                                              #
+################################################################################
+
+# -------------------------- subroutine: eland result --------------------------
+
+# Parse one line of an ELAND result file.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (no match, QC failure,
+# multi-read, or truncated record); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+
+sub eland_result {
+    my $line = shift;
+    my $L = shift;
+
+    # parse line
+
+    my ($t1, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
+
+    # truncated or blank line: strand is the last column that is needed.
+    # Without this check such a line was reported as a valid read
+    # with an undefined chromosome and position.
+
+    return ( 0 ) unless ( defined $str );
+
+    # exclude invalid lines (QC, NM) and multi-reads (R0, R1, R2);
+    # the code is looked for in the match column and the chromosome column
+
+    foreach my $code ( $map, $chrt ) {
+        return ( 0 ) if ( $code =~ /\A(?:QC|NM|R[012])\z/ );
+    }
+
+    # exclude multi-reads given as a comma-separated list of positions:
+    # the position of a uniquely aligned read is a plain integer
+
+    return ( 0 ) unless ( $pos =~ /\A\d+\z/ );
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: eland extended ------------------------
+
+# Parse one line of an ELAND extended file.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (QC/NM/RM, multi-read,
+# truncated record, or a match description that cannot be parsed); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+
+sub eland_extended {
+    my $line = shift;
+    my $L = shift;
+
+    # parse line
+
+    my ($t1, $seq, $map, $map_result) = split /\t/, $line;
+
+    # truncated or blank line
+
+    return ( 0 ) unless ( defined $map_result );
+
+    # keep uniquely aligned reads only: this excludes invalid lines
+    # (QC, NM, RM) as well as multi-reads
+
+    return ( 0 ) if ( $map ne "1:0:0" && $map ne "0:1:0" && $map ne "0:0:1" );
+
+    # process chromosome, position, and strand
+    # (use "chr\w+.fa" in order to cover chrX & chrY as well).
+    # A failed match is reported as an invalid line; the captures must not
+    # be read in that case, because they would not belong to this line.
+
+    my ( $chrt, $pos, $str ) = ( $map_result =~ m/(chr\w+.fa):(\d+)([FR])/ );
+    return ( 0 ) unless ( defined $str );
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: eland export --------------------------
+
+# Parse one line of an ELAND export file.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# The read sequence is taken from column 9, and chromosome, position and
+# strand from columns 11 to 13.
+#
+# Returns ( 0 ) if the line has to be skipped (QC/NM, multi-read, or
+# truncated record); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+
+sub eland_export {
+    my $line = shift;
+    my $L = shift;
+
+    # parse line
+
+    my ($t1, $t2, $t3, $t4, $t5, $t6, $t7, $t8, $seq, $t9, $chrt, $pos, $str, @rest) = split /\t/, $line;
+
+    # truncated or blank line: strand is the last column that is needed.
+    # Without this check such a line was reported as a valid read
+    # with an undefined chromosome and position.
+
+    return ( 0 ) unless ( defined $str );
+
+    # exclude invalid lines
+
+    return ( 0 ) if ( $chrt eq "QC" || $chrt eq "NM" );
+
+    # exclude multi-reads: the chromosome column contains ":" for them
+
+    return ( 0 ) if ( scalar( split( /\:/, $chrt ) ) > 1 );
+
+    # the position has to be a number
+
+    return ( 0 ) unless ( $pos =~ /\A-?\d+\z/ );
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: bowtie default ------------------------
+
+# Parse one line of a bowtie default output file.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (truncated record, position
+# that is not a number, or strand other than "+" / "-"); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+# The returned position starts from 1 and the strand is "F" or "R".
+
+sub bowtie {
+    my $line = shift;
+    my $L = shift;
+
+    # parse line
+
+    my ( $read_idx, $str_org, $chrt, $pos, $seq, @rest ) = split /\t/, $line;
+
+    # exclude invalid lines: without this check a blank line was reported
+    # as a valid read at position 1 of an undefined chromosome
+
+    return ( 0 ) unless ( defined $seq && $pos =~ /\A\d+\z/ );
+
+    # exclude multi-reads?
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively
+
+    my %strand_of = ( "+" => "F", "-" => "R" );
+    my $str = $strand_of{$str_org};
+    return ( 0 ) unless ( defined $str );
+
+    # pos: 0-based offset -> adjust
+
+    $pos++;
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: SAM -----------------------------------
+
+# Parse one line of a SAM file.
+#
+# Arguments:
+#   - line: one record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (header line, truncated
+# record, or FLAG bit 4, 512 or 1024 set); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+
+sub sam {
+    my $line = shift;
+    my $L = shift;
+
+    # exclude lines starting with "@" (comments) and blank lines
+
+    return ( 0 ) unless ( $line =~ /^[^@].+/ );
+
+    # parse line
+
+    my ($t1, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
+
+    # truncated record, or FLAG / POS is not a number: without this check
+    # such a line was reported as a valid forward-strand read
+
+    return ( 0 ) unless ( defined $seq );
+    return ( 0 ) unless ( $bwflag =~ /\A\d+\z/ && $pos =~ /\A\d+\z/ );
+
+    # exclude invalid lines
+    # (bits 4, 512, 1024: unmapped, QC failure, duplicate in the SAM specification)
+
+    return ( 0 ) if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 );
+
+    # strand: bit 16 marks the reverse strand
+
+    my $str = ( $bwflag & 16 ) ? "R" : "F";
+
+    # exclude multi-reads?
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    $pos = int($pos);
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: BED --------------------------
+
+# Parse one line of a BED file.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (track / browser / comment
+# line, truncated record, or coordinates that are not numbers); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ) with prob = 1.
+# The returned position starts from 1. The strand is "F" or "R"; it is
+# left undefined if column 6 is neither "+" nor "-".
+
+sub bed {
+    my $line = shift;
+    my $L = shift;
+
+    # skip track line. The keyword is followed by space-separated settings
+    # ("track name=..."), so it has to be matched as a prefix: comparing the
+    # first tab-separated field to "track" let such lines through as reads.
+
+    return ( 0 ) if ( $line =~ /\A(?:track|browser)\b/ || $line =~ /\A#/ );
+
+    # parse line
+
+    my ($chrt, $start, $end, $t1, $t2, $str_org, @rest) = split /\t/, $line;
+
+    # truncated or blank line, or coordinates that are not numbers
+
+    return ( 0 ) unless ( defined $end );
+    return ( 0 ) unless ( $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/ );
+
+    # pos: 0-based offset -> adjust
+
+    my $pos = $start;
+    $pos++;
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively
+
+    my $str;
+    if ( defined $str_org ) {
+        if ( $str_org eq "+" ) {
+            $str = "F";
+        } elsif ( $str_org eq "-" ) {
+            $str = "R";
+        }
+    }
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+    # NOTE(review): read length is computed as end - start + 1, i.e. the
+    # end column is taken as inclusive -- confirm against the BED producer.
+
+    my $read_length = $end - $start + 1;
+    my $L_tmp = $L;
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
+
+# -------------------------- subroutine: CSEM --------------------------
+
+# Parse one line of a CSEM file in BED layout.
+#
+# Arguments:
+#   - line: one tab-separated record, without the trailing newline
+#   - L: expected fragment length
+#
+# Returns ( 0 ) if the line has to be skipped (track / browser / comment
+# line, truncated record, or coordinates that are not numbers); otherwise
+# ( 1, chromosome, position, strand, L, read length, prob ),
+# where prob = score (column 5) / 1000 and the position starts from 1.
+# The strand is "F" or "R"; it is left undefined if column 6 is neither
+# "+" nor "-".
+
+sub csem {
+    my $line = shift;
+    my $L = shift;
+
+    # skip track line. The keyword is followed by space-separated settings
+    # ("track name=..."), so it has to be matched as a prefix: comparing the
+    # first tab-separated field to "track" let such lines through as reads.
+
+    return ( 0 ) if ( $line =~ /\A(?:track|browser)\b/ || $line =~ /\A#/ );
+
+    # parse line
+
+    my ($chrt, $start, $end, $t1, $score, $str_org, @rest) = split /\t/, $line;
+
+    # truncated or blank line (the score is needed for the read weight),
+    # or coordinates that are not numbers
+
+    return ( 0 ) unless ( defined $score );
+    return ( 0 ) unless ( $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/ );
+
+    # pos: 0-based offset -> adjust
+
+    my $pos = $start;
+    $pos++;
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively
+
+    my $str;
+    if ( defined $str_org ) {
+        if ( $str_org eq "+" ) {
+            $str = "F";
+        } elsif ( $str_org eq "-" ) {
+            $str = "R";
+        }
+    }
+
+    # fragment length: the expected length is used as it is,
+    # even if it is shorter than the read
+
+    my $read_length = $end - $start + 1;
+    my $L_tmp = $L;
+
+    # return: the score column holds prob * 1000
+
+    my $status = 1;
+    my $prob = $score / 1000;
+    return ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/readfile2wig_PET.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,359 @@
+###################################################################
+#
+#   Process read files into WIG files
+#
+#   Command arguments:
+#   - infile: Input file (directory + file name)
+#   - outdir: Directory of output files
+#   - format: File format
+#   - span: wig span
+#	- norm_const: normalizing constant
+#   - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
+#   - bychr: Construct WIG files by chromosome? (Y or N)
+#	- chrinfo: Is the file for chromosome info provided?
+#	- chrfile: File name for chromosome info (chr size)
+#	- @excludeChr: Chromosomes to be excluded (vector)
+#
+#   Supported file format:
+#   - eland_result, sam
+#
+#   Note
+#   - chromosomes are extracted from read files, after excluding invalid lines
+#   - uni-reads are assumed
+#
+###################################################################
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+use Cwd;
+use File::Basename;
+
+my ($infile, $outdir, $format, $span, $norm_const, $collapse, $bychr,
+	$chrinfo, $chrfile, @exclude_chr) = @ARGV;
+
+# extract only filename from $infile
+
+my @pr = fileparse( $infile );
+my $filename = $pr[0];
+
+# remember current working directory
+
+my $pwd = cwd();
+
+# construct span-level data based on chromosome info, if provided:
+# every chromosome listed in the file gets a zero count for each of its spans
+
+my %bin_count = ();
+my $bin_start = 0;
+my $bin_stop = 0;
+
+if ( $chrinfo eq "Y" ) {
+	# three-argument open: the file name is never interpreted as a mode
+	open( my $chr_fh, "<", $chrfile ) or die "Cannot open $chrfile\n";
+
+	while ( my $chr_line = <$chr_fh> ) {
+		chomp $chr_line;
+
+		# split ' ' ignores leading white space; blank lines and lines whose
+		# second column does not start with a digit (e.g. a header line)
+		# are skipped instead of creating a bogus chromosome
+		my ( $chrname, $chrsize ) = split ' ', $chr_line;
+		next unless ( defined $chrsize && $chrsize =~ /^\d/ );
+
+		$bin_start = 0;
+		$bin_stop = int($chrsize/$span);
+		$bin_count{$chrname} = [ (0) x ( $bin_stop + 1 ) ];
+	}
+
+	close( $chr_fh );
+}
+
+# chromosomes to be excluded (only the hash keys are ever looked up)
+
+my $ecyn = ( scalar(@exclude_chr) == 0 ) ? "N" : "Y";
+my %ec_hash;
+@ec_hash{ @exclude_chr } = ();
+
+# process PET read file
+# (chromosome information is extracted from read file)
+#
+# Mates are paired by read ID (the part of the read name before "/").
+# The first mate seen is stored in %seen_pos / %seen_str; when the other
+# mate shows up, every span overlapped by the fragment is incremented by one.
+
+open( my $in_fh, "<", $infile ) or die "Cannot open input file $infile\n";
+
+my %seen =();			# number of fragments per (chromosome, start, end)
+
+my %seen_pos =();		# position (leftmost position, starting from 1)
+my %seen_str = ();		# strand ('F' or 'R')
+
+my $start;
+my $end;
+
+while ( my $line = <$in_fh> ) {
+	chomp $line;
+
+	# process read file, based on "format" option
+
+	my @parsed;
+
+	if ( $format eq "eland_result" ) {
+		@parsed = eland_result( $line );
+	} elsif ( $format eq "sam" ) {
+		@parsed = sam( $line );
+	} else {
+		# Unsupported file format -> exit and return 1 to environment
+
+		exit 1;
+	}
+
+	# skip if invalid line
+
+	next if ( $parsed[0] == 0 );
+
+	# otherwise, process it
+
+	my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed;
+	my ($id_only) = split /\//, $id;
+
+	# skip if chrt \in @exclude_chr
+
+	next if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } );
+
+	# first mate of a pair: record it and wait for the other one
+
+	unless ( exists $seen_pos{$id_only} ) {
+		$seen_pos{$id_only} = $pos;
+		$seen_str{$id_only} = $str;
+		next;
+	}
+
+	# both mates of a pair should be on different strands;
+	# otherwise keep the recorded mate and ignore this one
+
+	if ( $seen_str{$id_only} eq $str ) {
+		print "inappropriate pair for id $id.\n";
+		next;
+	}
+
+	# the pair is complete: remove the recorded mate right away, so that
+	# every "next" below releases its memory as well
+
+	my $pos1 = delete $seen_pos{$id_only};
+	my $str1 = delete $seen_str{$id_only};
+	my $pos2 = $pos;
+
+	# in the pair, the upstream mate gives the start and the
+	# downstream (reverse-strand) mate gives pos + read length - 1 as end.
+	# [Note] eland_result: pos = leftmost position, starting from 1.
+
+	if ( $pos1 < $pos2 && $str eq "R" ) {
+		$start = $pos1;
+		$end = $pos2 + $read_length - 1;
+	} elsif ( $pos1 > $pos2 && $str1 eq "R" ) {
+		$start = $pos2;
+		$end = $pos1 + $read_length - 1;
+	} else {
+		# mates at the same position, or the downstream mate is not on
+		# the reverse strand: no fragment can be built. Skip the pair;
+		# falling through would count the coordinates of the previously
+		# processed pair once more.
+
+		print "check id $id!\n";
+		next;
+	}
+
+	# process to WIG files if collapse condition is satisfied.
+	# The key uses a separator so that e.g. (chr1, 12, ...) and
+	# (chr11, 2, ...) cannot be mistaken for the same fragment.
+
+	if ( $collapse > 0 ) {
+		my $id_collapse = join( "\t", $chrt, $start, $end );
+		next if ( ++$seen{$id_collapse} > $collapse );
+	}
+
+	# update span count; if there is no matrix for chromosome yet, construct one
+
+	@{$bin_count{$chrt}} = () unless ( exists $bin_count{$chrt} );
+
+	$bin_start = int(($start-1)/$span);
+	$bin_stop = int(($end-1)/$span);
+	for my $i ( $bin_start .. $bin_stop ) {
+		$bin_count{$chrt}[$i] += 1;
+	}
+}
+
+close( $in_fh );
+
+# move to output directory; stop here if that fails, because the files
+# would otherwise be written to the current directory without any notice
+
+chdir($outdir) or die "Cannot change directory to $outdir\n";
+
+# write WIG files: a track line per file, then for each chromosome a
+# "variableStep" line followed by one "position value" line per span.
+# Chromosomes are written in sorted order for reproducible output.
+
+my @chr_ids = sort keys %bin_count;
+
+# list of [ output file name, chromosomes written to that file ]
+
+my @outputs;
+
+if ( $bychr eq "N" ) {
+	# genome-wide version: all chromosome in one file
+
+	push @outputs, [ $filename."_span".$span.".wig", \@chr_ids ];
+} else {
+	# chromosome version: one chromosome in each file
+
+	foreach my $chr_id (@chr_ids) {
+		push @outputs, [ $filename."_span".$span."_".$chr_id.".wig", [ $chr_id ] ];
+	}
+}
+
+foreach my $output (@outputs) {
+	my ( $outfile, $chrs ) = @{$output};
+
+	open( my $out_fh, ">", $outfile ) or die "Cannot open $outfile\n";
+
+	print {$out_fh} "track type=wiggle_0 name=\"".$outfile."\" ";
+	print {$out_fh} "description=\"".$outfile."\"\n";
+
+	foreach my $chr_id ( @{$chrs} ) {
+		my $bins = $bin_count{$chr_id};
+
+		print {$out_fh} "variableStep chrom=$chr_id span=$span\n";
+
+		for my $i ( 0 .. $#{$bins} ) {
+			my $coord = $i*$span + 1;
+
+			# untouched spans are undefined and written as 0;
+			# all other counts are scaled by the normalizing constant
+			my $count_final = $bins->[$i] ? $bins->[$i] * $norm_const : 0;
+			print {$out_fh} "$coord $count_final\n";
+		}
+	}
+
+	# each file is closed (and checked) before the next one is opened;
+	# buffered write errors only show up at this point
+
+	close( $out_fh ) or die "Cannot write $outfile\n";
+}
+
+
+################################################################################
+#                                                                              #
+#                              subroutines                                     #
+#                                                                              #
+################################################################################
+
+# -------------------------- subroutine: eland result --------------------------
+
+# Parse one line of an ELAND result file (PET version).
+#
+# Argument:
+#   - line: one tab-separated record, without the trailing newline
+#
+# Returns ( 0 ) if the line has to be skipped (no match, QC failure,
+# multi-read, or truncated record); otherwise
+# ( 1, read ID, chromosome, position, strand, read length, prob ) with prob = 1.
+
+sub eland_result {
+    my $line = shift;
+
+    # parse line
+
+    my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
+
+    # truncated or blank line: strand is the last column that is needed.
+    # Without this check such a line was reported as a valid read
+    # with an undefined ID, chromosome and position.
+
+    return ( 0 ) unless ( defined $str );
+
+    # exclude invalid lines (QC, NM) and multi-reads (R0, R1, R2);
+    # the code is looked for in the match column and the chromosome column
+
+    foreach my $code ( $map, $chrt ) {
+        return ( 0 ) if ( $code =~ /\A(?:QC|NM|R[012])\z/ );
+    }
+
+    # exclude multi-reads given as a comma-separated list of positions:
+    # the position of a uniquely aligned read is a plain integer
+
+    return ( 0 ) unless ( $pos =~ /\A\d+\z/ );
+
+    # return
+
+    my $read_length = length $seq;
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
+}
+
+# -------------------------- subroutine: SAM -----------------------------------
+
+# Parse one line of a SAM file (PET version).
+#
+# Argument:
+#   - line: one record, without the trailing newline
+#
+# Returns ( 0 ) if the line has to be skipped (header line, truncated
+# record, or FLAG bit 4, 512 or 1024 set); otherwise
+# ( 1, read ID, chromosome, position, strand, read length, prob ) with prob = 1.
+
+sub sam {
+    my $line = shift;
+
+    # exclude lines starting with "@" (comments) and blank lines
+
+    return ( 0 ) unless ( $line =~ /^[^@].+/ );
+
+    # parse line
+
+    my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
+
+    # truncated record, or FLAG / POS is not a number: without this check
+    # such a line was reported as a valid forward-strand read
+
+    return ( 0 ) unless ( defined $seq );
+    return ( 0 ) unless ( $bwflag =~ /\A\d+\z/ && $pos =~ /\A\d+\z/ );
+
+    # exclude invalid lines
+    # (bits 4, 512, 1024: unmapped, QC failure, duplicate in the SAM specification)
+
+    return ( 0 ) if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 );
+
+    # strand: bit 16 marks the reverse strand
+
+    my $str = ( $bwflag & 16 ) ? "R" : "F";
+
+    # exclude multi-reads?
+
+    # return
+
+    $pos = int($pos);
+    my $read_length = length $seq;
+    my $status = 1;
+    my $prob = 1;
+    return ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/Perl/readfile2wig_SET.pl	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,569 @@
+###################################################################
+#
+#   Process read files into WIG files
+#
+#   Command arguments:
+#   - infile: Input file (directory + file name)
+#   - outdir: Directory of output files
+#   - format: File format
+#   - span: wig span
+#	- norm_const: normalizing constant
+#   - L: Expected fragment length
+#   - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
+#   - bychr: Construct bin-level files by chromosome? (Y or N)
+#	- chrinfo: Is the file for chromosome info provided?
+#	- chrfile: File name for chromosome info (chr size)
+#	- @excludeChr: Chromosomes to be excluded (vector)
+#
+#   Supported file format:
+#   - eland_result, eland_extended, eland_export, bowtie, sam, bed, csem
+#
+#   Note
+#   - chromosomes are extracted from read files, after excluding invalid lines
+#   - uni-reads are assumed
+#
+###################################################################
+
+#!/usr/bin/env perl;
+use warnings;
+use strict;
+use Cwd;
+use File::Basename;
+
+# positional command-line arguments, in the order listed in the header comment;
+# every argument after the tenth is the name of a chromosome to be excluded
+
+my ($infile, $outdir, $format, $span, $norm_const, $L, $collapse, $bychr,
+    $chrinfo, $chrfile, @exclude_chr) = @ARGV;
+
+# extract only filename from $infile
+
+my @pr = fileparse( $infile );
+my $filename = $pr[0];
+
+# remember current working directory
+
+my $pwd = cwd();
+
+# construct bin-level data based on chromosome info, if provided:
+# %bin_count maps a chromosome name to an array of per-bin counts
+
+my %bin_count = ();
+my $bin_start = 0;
+my $bin_stop = 0;
+
+if ( $chrinfo eq "Y" ) {
+    # three-argument open: the file name is never interpreted as a mode or pipe
+    open my $chr_fh, '<', $chrfile or die "Cannot open $chrfile\n";
+
+    while ( my $chr_line = <$chr_fh> ) {
+        chomp $chr_line;
+
+        # split ' ' ignores leading whitespace, so an indented line still
+        # yields (name, size)
+        my ( $chrname, $chrsize ) = split ' ', $chr_line;
+
+        # skip blank or incomplete lines; they would otherwise register a
+        # chromosome with an empty name
+        next unless defined $chrname && defined $chrsize;
+
+        # pre-fill every bin of the chromosome with a zero count
+        $bin_start = 0;
+        $bin_stop = int($chrsize/$span);
+
+        for my $i ( $bin_start .. $bin_stop ) {
+            $bin_count{$chrname}[$i] = 0;
+        }
+    }
+
+    close $chr_fh;
+}
+
+# chromosomes to be excluded: $ecyn is "Y" when at least one was given and
+# %ec_hash has one key per excluded chromosome (only key existence is tested)
+
+my $ecyn;
+my %ec_hash;
+
+if ( scalar(@exclude_chr) == 0 ) {
+    $ecyn = "N";
+} else {
+    $ecyn = "Y";
+    @ec_hash{ @exclude_chr } = 0;
+}
+
+# load read file and process it
+# (chromosome information is extracted from read file)
+
+# three-argument open: the file name is never interpreted as a mode or pipe
+open my $in_fh, '<', $infile or die "Cannot open $infile\n";
+
+# parser for each supported "format" option; every parser takes ( line, L )
+# and returns a list whose first element is 0 when the line must be skipped
+
+my %parser_for = (
+    eland_result   => \&eland_result,
+    eland_extended => \&eland_extended,
+    eland_export   => \&eland_export,
+    bowtie         => \&bowtie,
+    sam            => \&sam,
+    bed            => \&bed,
+    csem           => \&csem,
+);
+
+# number of reads seen so far for each (chromosome, position, strand)
+
+my %seen = ();
+
+while ( my $line = <$in_fh> ) {
+    chomp $line;
+
+    # process read file, based on "format" option
+
+    my $parser = $parser_for{$format};
+
+    if ( !defined $parser ) {
+        # Unsupported file format -> exit and return 1 to R environment
+
+        exit 1;
+    }
+
+    my @parsed = $parser->( $line, $L );
+
+    # skip if invalid line
+
+    next if ( $parsed[0] == 0 );
+
+    # otherwise, process it
+
+    my ($status, $chrt, $pos, $str, $L_tmp, $read_length, $prob ) = @parsed;
+
+    # skip if chrt \in @exclude_chr
+
+    if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) {
+        next;
+    }
+
+    # process to bin-level files if collapse condition is satisfied
+    # (the key uses a tab separator: plain concatenation would give the same
+    # key to e.g. chr1 at position 12 and chr11 at position 2)
+
+    my $id_collapse = join( "\t", $chrt, $pos, $str );
+    $seen{$id_collapse}++;
+
+    if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) {
+        next;
+    }
+
+    # adjust position if it is reverse strand
+
+    $pos = $pos + $read_length - $L if ( $str eq "R" );
+    if ( $pos <= 0 ) {
+        $pos = 1;
+    }
+
+    # update bin count; if there is no array for the chromosome yet,
+    # construct one first
+
+    $bin_count{$chrt} = [] unless exists $bin_count{$chrt};
+
+    # every bin overlapped by [ pos, pos + L_tmp - 1 ] receives the read weight
+
+    $bin_start = int(($pos-1)/$span) ;
+    $bin_stop = int(($pos + $L_tmp - 1 - 1 )/$span) ;
+    for my $i ( $bin_start .. $bin_stop ) {
+        $bin_count{$chrt}[$i] += $prob;
+    }
+}
+
+close $in_fh;
+
+# move to output directory; stop if that fails, otherwise the files below
+# would be written to whatever the current directory happens to be
+
+chdir($outdir) or die "Cannot change directory to $outdir\n";
+
+# write bin-level files (chromosomes are written in sorted name order so that
+# the output is reproducible between runs)
+
+if ( $bychr eq "N" ) {
+    # genome-wide version: all chromosome in one file
+
+    my $outfile = $filename."_fragL".$L."_span".$span.".wig";
+    open my $out_fh, '>', $outfile or die "Cannot open $outfile\n";
+
+    print {$out_fh} "track type=wiggle_0 name=\"".$outfile."\" ";
+    print {$out_fh} "description=\"".$outfile."\"\n";
+
+    foreach my $chr_id (sort keys %bin_count) {
+        write_wig_section( $out_fh, $chr_id, $bin_count{$chr_id}, $span, $norm_const );
+    }
+
+    # buffered write errors only surface when the handle is closed
+    close $out_fh or die "Cannot close $outfile\n";
+} else {
+    # chromosome version: one chromosome in each file
+
+    foreach my $chr_id (sort keys %bin_count) {
+        my $outfile = $filename."_fragL".$L."_span".$span."_".$chr_id.".wig";
+        open my $out_fh, '>', $outfile or die "Cannot open $outfile\n";
+
+        print {$out_fh} "track type=wiggle_0 name=\"".$outfile."\" ";
+        print {$out_fh} "description=\"".$outfile."\"\n";
+
+        write_wig_section( $out_fh, $chr_id, $bin_count{$chr_id}, $span, $norm_const );
+
+        # each per-chromosome file is closed (and checked) inside the loop
+        close $out_fh or die "Cannot close $outfile\n";
+    }
+}
+
+# Write the variableStep section of one chromosome to an open file handle.
+#
+# Arguments: output handle, chromosome name, reference to the array of bin
+# counts, wig span, normalizing constant.
+# One line "coordinate value" is printed per bin; the coordinate is the
+# 1-based start of the bin, and empty or zero bins are printed as 0.
+
+sub write_wig_section {
+    my ( $out_fh, $chr_id, $bin_count_chr, $span, $norm_const ) = @_;
+
+    print {$out_fh} "variableStep chrom=$chr_id span=$span\n";
+
+    for my $i ( 0 .. $#{$bin_count_chr} ) {
+        my $coord = $i*$span + 1;
+        if ( $bin_count_chr->[$i] ) {
+            my $count_final = $bin_count_chr->[$i] * $norm_const;
+            print {$out_fh} "$coord $count_final\n";
+        } else {
+            print {$out_fh} "$coord 0\n";
+        }
+    }
+
+    return;
+}
+
+
+################################################################################
+#                                                                              #
+#                              subroutines                                     #
+#                                                                              #
+################################################################################
+
+# -------------------------- subroutine: eland result --------------------------
+
+# Parse one line of an Eland result file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for lines to be skipped, otherwise
+# ( 1, chromosome, position, strand, L, read length, 1 ).
+
+sub eland_result {
+    my ( $line, $L ) = @_;
+
+    my @invalid = (0);
+
+    # parse line: sequence, match code, chromosome, position and strand are
+    # the 2nd, 3rd, 7th, 8th and 9th tab-separated fields
+
+    my ( $seq, $map, $chrt, $pos, $str ) = ( split /\t/, $line )[ 1, 2, 6, 7, 8 ];
+
+    # exclude invalid lines (QC, NM) and multi-reads (R0, R1, R2), whether
+    # the code appears in the match field or in the chromosome field
+
+    foreach my $field ( $map, $chrt ) {
+        return @invalid if grep { $field eq $_ } qw( QC NM R0 R1 R2 );
+    }
+
+    # exclude multi-reads: more than one comma-separated position
+
+    my @positions = split( /,/, $pos );
+    return @invalid if @positions > 1;
+
+    # fragment length adjustment: L is passed through unchanged
+    # (the adjustment to the read length is disabled)
+
+    # return (status 1, weight 1)
+
+    my @parsed = ( 1, $chrt, $pos, $str, $L, length($seq), 1 );
+    return @parsed;
+}
+
+# -------------------------- subroutine: eland extended ------------------------
+
+# Parse one line of an Eland extended file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for lines to be skipped (QC/NM/RM, multi-reads, or a match
+# description from which chromosome, position and strand cannot be read),
+# otherwise ( 1, chromosome, position, strand, L, read length, 1 ).
+
+sub eland_extended {
+    my $line = shift;
+    my $L = shift;
+
+    my @invalid = (0);
+
+    # parse line
+
+    my ($t1, $seq, $map, $map_result) = split /\t/, $line;
+
+    # a line without a match-count field cannot be classified
+
+    return @invalid unless defined $map;
+
+    # exclude invalid lines
+
+    if ( $map eq "QC" || $map eq "NM" || $map eq "RM" ) {
+        return @invalid;
+    }
+
+    # exclude multi-reads: keep only reads with exactly one match
+
+    if ( $map ne "1:0:0" && $map ne "0:1:0" && $map ne "0:0:1" ) {
+        return @invalid;
+    }
+
+    # process chromosome, position, and strand
+    # use "chr\w+.fa" in order to cover chrX & chrY as well
+    #
+    # The captures are only meaningful after a successful match: without this
+    # check a non-matching line would be returned as valid with undefined (or
+    # left-over) chromosome, position and strand.
+
+    unless ( defined $map_result
+        && $map_result =~ m/(chr\w+.fa):(\d+)([FR])/ )
+    {
+        return @invalid;
+    }
+
+    my ( $chrt, $pos, $str ) = ( $1, $2, $3 );
+
+    # fragment length adjustment
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+    #if ( $L < $read_length ) {
+    #    $L_tmp = $read_length;
+    #}
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    my @parsed = ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+    return @parsed;
+}
+
+# -------------------------- subroutine: eland export --------------------------
+
+# Parse one line of an Eland export file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for lines to be skipped, otherwise
+# ( 1, chromosome, position, strand, L, read length, 1 ).
+
+sub eland_export {
+    my ( $line, $L ) = @_;
+
+    my @invalid = (0);
+
+    # parse line: sequence, chromosome, position and strand are the 9th,
+    # 11th, 12th and 13th tab-separated fields
+
+    my ( $seq, $chrt, $pos, $str ) = ( split /\t/, $line )[ 8, 10, 11, 12 ];
+
+    # exclude invalid lines
+
+    return @invalid if $chrt eq "QC" || $chrt eq "NM";
+
+    # exclude multi-reads: the chromosome field then holds several
+    # colon-separated parts
+
+    my @chrt_parts = split( /\:/, $chrt );
+    return @invalid if @chrt_parts > 1;
+
+    # fragment length adjustment: L is passed through unchanged
+    # (the adjustment to the read length is disabled)
+
+    # return (status 1, weight 1)
+
+    my @parsed = ( 1, $chrt, $pos, $str, $L, length($seq), 1 );
+    return @parsed;
+}
+
+# -------------------------- subroutine: bowtie default ------------------------
+
+# Parse one line of a default Bowtie output file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for lines that cannot be used (fewer than five tab-separated
+# fields, or a strand other than "+" / "-"), otherwise
+# ( 1, chromosome, 1-based position, strand ("F"/"R"), L, read length, 1 ).
+
+sub bowtie {
+    my $line = shift;
+    my $L = shift;
+
+    my @invalid = (0);
+
+    # parse line
+
+    my ( $read_idx, $str_org, $chrt, $pos, $seq, @rest ) = split /\t/, $line;
+
+    # exclude invalid lines: a blank or truncated line has no sequence field
+    # and would otherwise be returned with an undefined chromosome
+
+    return @invalid unless defined $seq;
+
+    # exclude multi-reads?
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively;
+    # any other value is treated as an invalid line
+
+    my %strand_for = ( "+" => "F", "-" => "R" );
+    my $str = $strand_for{$str_org};
+
+    return @invalid unless defined $str;
+
+    # pos: 0-based offset -> adjust
+
+    $pos++;
+
+    # fragment length adjustment
+
+    my $read_length = length $seq;
+    my $L_tmp = $L;
+    #if ( $L < $read_length ) {
+    #    $L_tmp = $read_length;
+    #}
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    my @parsed = ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+    return @parsed;
+}
+
+# -------------------------- subroutine: SAM -----------------------------------
+
+# Parse one line of a SAM file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) when the line must be skipped (header line starting with "@",
+# line shorter than two characters, or a read whose flag has bit 4, 512 or
+# 1024 set); otherwise
+# ( 1, chromosome, position, strand ("F"/"R"), L, read length, 1 ).
+
+sub sam {
+    my ( $line, $L ) = @_;
+
+    my @invalid = (0);
+
+    # exclude lines starting with "@" (comments)
+
+    return @invalid unless $line =~ /^[^@].+/;
+
+    # fields used: FLAG, RNAME, POS and SEQ (the 10th column)
+
+    my ( $bwflag, $chrt, $pos, $seq ) = ( split /\t/, $line, 12 )[ 1, 2, 3, 9 ];
+    $pos = int($pos);
+
+    # exclude invalid lines: any of the flag bits 4, 512 or 1024 is set
+
+    return @invalid if $bwflag & ( 4 | 512 | 1024 );
+
+    # flag bit 16 marks the reverse strand
+
+    my $str = ( $bwflag & 16 ) ? "R" : "F";
+
+    # exclude multi-reads?
+
+    # fragment length adjustment: L is passed through unchanged
+    # (the adjustment to the read length is disabled)
+
+    # return (status 1, weight 1)
+
+    my @parsed = ( 1, $chrt, $pos, $str, $L, length($seq), 1 );
+    return @parsed;
+}
+
+# -------------------------- subroutine: BED --------------------------
+
+# Parse one line of a BED file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for lines to be skipped (blank lines and "track" / "browser"
+# header lines), otherwise
+# ( 1, chromosome, 1-based position, strand, L, read length, 1 ).
+# The strand is "F" for "+", "R" for "-", and undefined for anything else.
+
+sub bed {
+    my $line = shift;
+    my $L = shift;
+
+    my @invalid = (0);
+
+    # skip track line
+    # (header lines are space-separated, so the first token has to be taken
+    # after splitting on whitespace, as the CSEM parser does; splitting on
+    # tabs would leave the whole header line in the first field)
+
+    my ($element1) = split ' ', $line;
+
+    if ( !defined $element1 || $element1 eq "track" || $element1 eq "browser" ) {
+        return @invalid;
+    }
+
+    # parse line
+
+    my ($chrt, $start, $end, $t1, $t2, $str_org, @rest) = split /\t/, $line;
+
+    # pos: 0-based offset -> adjust
+
+    my $pos = $start;
+    $pos++;
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively
+
+    my $str;
+    if ( $str_org eq "+" ) {
+        $str = "F";
+    } elsif ( $str_org eq "-" ) {
+        $str = "R";
+    }
+
+    # fragment length adjustment
+    # NOTE(review): the read length is taken as end - start + 1, i.e. the end
+    # coordinate is treated as inclusive; confirm against the BED files used.
+
+    my $read_length = $end - $start + 1;
+    my $L_tmp = $L;
+    #if ( $L < $read_length ) {
+    #    $L_tmp = $read_length;
+    #}
+
+    # return
+
+    my $status = 1;
+    my $prob = 1;
+    my @parsed = ( $status, $chrt, $pos, $str, $L_tmp, $read_length, $prob );
+    return @parsed;
+}
+
+# -------------------------- subroutine: CSEM --------------------------
+
+# Parse one line of a CSEM BED file.
+#
+# Arguments: the line, and the expected fragment length L.
+# Returns (0) for the track line, otherwise
+# ( 1, chromosome, 1-based position, strand, L, read length, score / 1000 ).
+# The strand is "F" for "+", "R" for "-", and undefined for anything else.
+
+sub csem {
+    my ( $line, $L ) = @_;
+
+    # skip track line: its first whitespace-separated token is "track"
+
+    my ($first_token) = split /\s+/, $line;
+
+    if ( $first_token eq "track" ) {
+        my @invalid = (0);
+        return @invalid;
+    }
+
+    # parse line: chromosome, start, end, score and strand are the 1st, 2nd,
+    # 3rd, 5th and 6th tab-separated fields
+
+    my ( $chrt, $start, $end, $score, $str_org ) = ( split /\t/, $line )[ 0, 1, 2, 4, 5 ];
+
+    # pos: 0-based offset -> adjust
+
+    my $pos = $start;
+    $pos++;
+
+    # str: "+" and "-" -> convert to "F" and "R", respectively
+
+    my $str = $str_org eq "+" ? "F"
+            : $str_org eq "-" ? "R"
+            :                   undef;
+
+    # fragment length adjustment: L is passed through unchanged
+    # (the adjustment to the read length is disabled)
+
+    my $read_length = $end - $start + 1;
+
+    # return: the weight of the read is its score divided by 1000
+
+    my @parsed = ( 1, $chrt, $pos, $str, $L, $read_length, $score / 1000 );
+    return @parsed;
+}
Binary file mosaics/inst/doc/Figure4a.pdf has changed
Binary file mosaics/inst/doc/Figure4b.pdf has changed
Binary file mosaics/inst/doc/Figure5a.pdf has changed
Binary file mosaics/inst/doc/Figure5b.pdf has changed
Binary file mosaics/inst/doc/Figure5c.pdf has changed
Binary file mosaics/inst/doc/Figure5d.pdf has changed
Binary file mosaics/inst/doc/GOF_matchLow.pdf has changed
Binary file mosaics/inst/doc/GOF_rMOM.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/inst/doc/mosaics-example.Rnw	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,982 @@
+% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
+\documentclass[11pt]{article}
+%% Set my margins
+\setlength{\oddsidemargin}{0.0truein}
+\setlength{\evensidemargin}{0.0truein}
+\setlength{\textwidth}{6.5truein}
+\setlength{\topmargin}{0.0truein}
+\setlength{\textheight}{9.0truein}
+\setlength{\headsep}{0.0truein}
+\setlength{\headheight}{0.0truein}
+\setlength{\topskip}{0pt}
+%% End of margins
+
+\usepackage{subfigure}
+
+%%\pagestyle{myheadings}
+%%\markboth{$Date$\hfil$Revision$}{\thepage}
+\usepackage[pdftex,
+bookmarks,
+bookmarksopen,
+pdfauthor={Dongjun Chung, Pei Fen Kuan and Sunduz Keles},
+pdftitle={mosaics Vignette}]
+{hyperref}
+
+\title{Analysis of ChIP-seq Data with `\texttt{mosaics}' Package}
+\author{Dongjun Chung$^1$, Pei Fen Kuan$^2$ and S\"und\"uz Kele\c{s}$^{1,3}$\\
+  $^1$Department of Statistics, University of Wisconsin\\
+  Madison, WI 53706.\\
+  $^2$Department of Biostatistics, University of North Carolina at Chapel Hill\\
+  Chapel Hill, NC 27599.\\
+  $^3$Department of Biostatistics and Medical Informatics,
+  University of Wisconsin\\Madison, WI 53706.}
+
+\date{\today}
+
+\SweaveOpts{engine=R, echo=TRUE, pdf=TRUE}
+
+\begin{document}
+%\VignetteIndexEntry{MOSAiCS}
+%\VignetteKeywords{MOSAiCS}
+%\VignettePackage{mosaics}
+\maketitle
+
+\tableofcontents
+
+\section{Overview}
+
+This vignette provides an introduction to the analysis of ChIP-seq data with
+the `\texttt{mosaics}' package. R package \texttt{mosaics} implements MOSAiCS,
+a statistical framework for the analysis of ChIP-seq data, proposed
+in \cite{mosaics}. MOSAiCS stands for ``\textbf{MO}del-based one and
+two \textbf{S}ample \textbf{A}nalysis and \textbf{I}nference for
+\textbf{C}hIP-\textbf{S}eq Data''. It implements a flexible parametric mixture
+modeling approach for detecting peaks, i.e., enriched regions, in one-sample
+(ChIP sample) or two-sample (ChIP and control samples) ChIP-seq data.
+It can account for mappability and GC content biases that arise in ChIP-seq data.
+
+The package can be loaded with the command:
+
+<<preliminaries,echo=FALSE,results=hide>>=
+options(prompt = "R> ")
+@
+
+<<mosaics-prelim>>=
+library("mosaics")
+@
+
+\section{Getting started}
+
+The `\texttt{mosaics}' package provides a flexible framework for ChIP-seq analysis.
+
+If you have the data for matched control sample,
+two-sample analysis is recommended.
+If the ChIP-seq data is deeply sequenced,
+the two-sample analysis without mappability and GC content
+(Section \ref{two_sample_no_MGC}) is usually appropriate.
+For the ChIP-seq data with low sequencing depth,
+the two-sample analysis with mappability and GC content
+(Section \ref{two_sample_MGC}) can be useful.
+When control sample is not available, `\texttt{mosaics}' package accommodates
+one-sample analysis of ChIP-seq data.
+In this case, you should have files for mappability and GC content,
+in addition to the file for the ChIP sample.
+
+We recommend that users start from Section \ref{mosaicsRunAll},
+which discusses the most convenient way to do the two-sample analysis
+(without using mappability and GC content).
+Section \ref{two_sample_no_MGC} discusses each step of the two-sample workflow in detail
+and provides command lines for each step.
+Sections \ref{two_sample_MGC} and \ref{one_sample} briefly explain
+the workflow and command lines for the two-sample analysis and the one-sample analysis
+with mappability and GC content, respectively.
+
+We encourage  questions or requests regarding
+`\texttt{mosaics}' package to be posted on our Google group
+\url{http://groups.google.com/group/mosaics_user_group}.
+
+\section{Two-Sample Analysis using \texttt{'mosaicsRunAll'}}\label{mosaicsRunAll}
+
+Two-sample analysis without mappability and GC content can be done
+in a more convenient way, with the command:
+
+<<mosaicsRunAll,eval=FALSE>>=
+     mosaicsRunAll(
+         chipFile="/scratch/eland/STAT1_ChIP_eland_results.txt",
+         chipFileFormat="eland_result",
+         controlFile="/scratch/eland/STAT1_control_eland_results.txt",
+         controlFileFormat="eland_result",
+         binfileDir="/scratch/bin/",
+         peakFile=c("/scratch/peak/STAT1_peak_list.bed",
+                    "/scratch/peak/STAT1_peak_list.gff"),
+         peakFileFormat=c("bed", "gff"),
+         reportSummary=TRUE,
+         summaryFile="/scratch/reports/mosaics_summary.txt",
+         reportExploratory=TRUE,
+         exploratoryFile="/scratch/reports/mosaics_exploratory.pdf",
+         reportGOF=TRUE,
+         gofFile="/scratch/reports/mosaics_GOF.pdf",
+         byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr="chrM",
+         PET=FALSE, FDR=0.05, fragLen=200, binSize=fragLen, capping=0,
+         bgEst="automatic", signalModel="BIC", parallel=TRUE, nCore=8 )
+@
+
+`\texttt{mosaicsRunAll}' method imports aligned read files, converts them to bin-level files (generated bin-level files will be saved in the directory
+specified in `\texttt{binfileDir}' argument for future use),
+fits the MOSAiCS model, identifies peaks, and exports the peak list.
+In addition, users can also make `\texttt{mosaicsRunAll}' method generate diverse analysis reports,
+such as summary report of parameters and analysis results, exploratory plots, and goodness of fit (GOF) plots.
+Arguments of `\texttt{mosaicsRunAll}' method are summarized in Table \ref{tab:mosaicsRunAll}.
+See Section \ref{constructBins} for
+details of the arguments `\texttt{chipFileFormat}', `\texttt{controlFileFormat}',
+`\texttt{byChr}', `\texttt{useChrfile}', `\texttt{chrfile}', `\texttt{excludeChr}',
+`\texttt{PET}', `\texttt{fragLen}', `\texttt{binSize}', and `\texttt{capping}'.
+See Section \ref{mosaicsFit} for details of the argument `\texttt{bgEst}'.
+See Section \ref{mosaicsPeak} for details of the arguments
+`\texttt{FDR}', `\texttt{signalModel}', `\texttt{peakFileFormat}',
+`\texttt{maxgap}', `\texttt{minsize}', and `\texttt{thres}'.
+
+\begin{table}
+\caption{ \label{tab:mosaicsRunAll} \textbf{Summary of the arguments of `\texttt{mosaicsRunAll}' method.} }
+\label{t:case}
+\begin{center}
+\begin{tabular}{ll}
+\multicolumn{2}{c}{(a) Input and output files} \\
+\hline
+Argument & Explanation \\
+\hline
+\texttt{chipFile} & Name of aligned read file of ChIP sample.\\
+\texttt{chipFileFormat} & File format of aligned read file of ChIP sample.\\
+\texttt{controlFile} & Name of aligned read file of matched control sample.\\
+\texttt{controlFileFormat} & File format of aligned read file of matched control sample.\\
+\texttt{binfileDir} & Directory that bin-level files are exported to.\\
+\texttt{peakFile} & Vector of file names of peak list.\\
+\texttt{peakFileFormat} & Vector of file formats of peak list.\\
+\hline
+\multicolumn{2}{c}{} \\
+\multicolumn{2}{c}{(b) Reports} \\
+\hline
+Argument & Explanation \\
+\hline
+\texttt{reportSummary} * & Generate analysis summary? \\
+\texttt{summaryFile} & File name of analysis summary. \\
+\texttt{reportExploratory} * & Generate exploratory plots? \\
+\texttt{exploratoryFile} & File name of exploratory plots. \\
+\texttt{reportGOF} * & Generate GOF plots? \\
+\texttt{gofFile} & File name of GOF plots. \\
+\hline
+\multicolumn{2}{l}{* Reports will be generated only when these arguments are \texttt{TRUE}. Default is \texttt{FALSE}.} \\
+\multicolumn{2}{c}{} \\
+\multicolumn{2}{c}{(c) Tuning parameters} \\
+\hline
+Argument & Explanation \\
+\hline
+\texttt{byChr} & Genome-wide analysis (\texttt{FALSE}) or chromosome-wise analysis (\texttt{TRUE})? \\
+\texttt{useChrfile} & Use the file containing chromosome ID and chromosome size? \\
+\texttt{chrfile} & Name of the file containing chromosome ID and chromosome size. \\
+\texttt{excludeChr} & Vector of chromosomes to be excluded from the analysis. \\
+\texttt{fragLen} & Average fragment length. \\
+\texttt{binSize} & Bin size. \\
+\texttt{capping} & Cap read counts in aligned read files? \\
+\texttt{bgEst} & Background estimation approach. \\
+\texttt{signalModel} & Signal model. \\
+\texttt{PET} & Paired-end tag (\texttt{TRUE}) or single-end tag (\texttt{FALSE}) ChIP-Seq data? \\
+\texttt{FDR} & False discovery rate (FDR). \\
+\texttt{maxgap} & Distance between initial peaks for merging. \\
+\texttt{minsize} & Minimum width to be called as a peak. \\
+\texttt{thres} & Minimum ChIP tag counts to be called as a peak. \\
+\texttt{parallel} & Use \texttt{parallel} package for parallel computing? \\
+\texttt{nCore} & Number of CPUs used for parallel computing. ** \\
+\hline
+\multicolumn{2}{l}{** Relevant only when \texttt{parallel=TRUE} and \texttt{parallel} package is installed.} \\
+\end{tabular}
+\end{center}
+\end{table}
+
+\section{Workflow: Two-Sample Analysis}\label{two_sample_no_MGC}
+
+\subsection{Constructing Bin-Level Files from the Aligned Read File}\label{constructBins}
+
+R package `\texttt{mosaics}' analyzes the data after converting aligned read files
+into bin-level files for modeling and visualization purposes.
+These bin-level data can easily be generated from the aligned read files with the command:
+
+<<constructBins,eval=FALSE>>=
+constructBins( infile="/scratch/eland/STAT1_eland_results.txt",
+    fileFormat="eland_result", outfileLoc="/scratch/eland/",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr="chrM",
+    PET=FALSE, fragLen=200, binSize=200, capping=0 )
+@
+
+You can specify the name and file format of the aligned read file
+in `\texttt{infile}' and `\texttt{fileFormat}' arguments, respectively.
+The `\texttt{PET}' argument indicates whether the file is paired-end tag (`\texttt{PET=TRUE}') or single-end tag (`\texttt{PET=FALSE}') ChIP-Seq data.
+Allowed aligned read file formats depend on the `\texttt{PET}' argument.
+`\texttt{constructBins}' method currently allows the following aligned read file formats for SET ChIP-Seq data:
+Eland result (`\texttt{"eland}$\_$\texttt{result"}'), Eland extended (`\texttt{"eland}$\_$\texttt{extended"}'),
+Eland export (`\texttt{"eland}$\_$\texttt{export"}'), default Bowtie (`\texttt{"bowtie"}'), SAM (`\texttt{"sam"}'), BED (`\texttt{"bed"}'), and CSEM BED (`\texttt{"csem"}').
+%This method assumes that these aligned read files are obtained from single-end tag (SET) experiments.
+For PET ChIP-Seq data, `\texttt{constructBins}' method currently accepts the
+Eland result (`\texttt{"eland}$\_$\texttt{result"}') and SAM (`\texttt{"sam"}') file formats.
+If input file format is neither BED nor CSEM BED,
+it retains only reads mapping uniquely to the reference genome (uni-reads).
+See Appendices \ref{example_aligned_SET} and \ref{example_aligned_PET} for example lines of each aligned read file format.
+
+Even though `\texttt{constructBins}' retains only uni-reads for most aligned read file formats,
+reads mapping to multiple locations on the reference genome (multi-reads)
+can be easily incorporated into bin-level files by utilizing our multi-read allocator,
+\texttt{CSEM} (ChIP-Seq multi-read allocator using Expectation-Maximization algorithm).
+Galaxy tool for CSEM is available in Galaxy Tool Shed (\url{http://toolshed.g2.bx.psu.edu/}; ``csem'' under ``Next Gen Mappers''). Stand-alone version of CSEM is also available at \url{http://www.stat.wisc.edu/~keles/Software/multi-reads/}.
+CSEM exports uni-reads and allocated multi-reads into standard BED file
+and the corresponding bin-level files can be constructed
+by applying `\texttt{constructBins}' method to this BED file with the argument
+`\texttt{fileFormat="csem"}'.
+
+`\texttt{constructBins}' can generate a single bin-level file containing all chromosomes
+(for a genome-wide analysis) or multiple bin-level files for each chromosome
+(for a chromosome-wise analysis).
+If `\texttt{byChr=FALSE}', bin-level data for all chromosomes are exported to one file
+named as `\texttt{[infileName]}$\_$\texttt{fragL[fragLen]}$\_$\texttt{bin[binSize].txt}' (for SET data)
+or `\texttt{[infileName]}$\_$\texttt{bin[binSize].txt}' (for PET data),
+where \texttt{[infileName]}, \texttt{[fragLen]}, and \texttt{[binSize]} are
+name of aligned read file, average fragment length, and bin size, respectively.
+If `\texttt{byChr=TRUE}', bin-level data for each chromosome is exported
+to a separate file named as `\texttt{[infileName]}$\_$\texttt{fragL[fragLen]}$\_$\texttt{bin[binSize]}$\_$\texttt{[chrID].txt}' (for SET data) or `\texttt{[infileName]}$\_$\texttt{bin[binSize]}$\_$\texttt{[chrID].txt}' (for PET data),
+where \texttt{[chrID]} is chromosome ID that reads align to.
+These chromosome IDs (\texttt{[chrID]}) are extracted from the aligned read file.
+The constructed bin-level files are exported to the directory
+specified in `\texttt{outfileLoc}' argument.
+
+The `\texttt{useChrfile}' argument indicates whether to use the file containing chromosome ID and chromosome size, which is specified in the `\texttt{chrfile}' argument.
+See Appendix \ref{example_chrfile} for the example lines of this chromosome information file.
+If you want to exclude some chromosomes in the processed bin-level files,
+you can specify these chromosomes in `\texttt{excludeChr}' argument.
+The `\texttt{excludeChr}' argument will be ignored if `\texttt{useChrfile=TRUE}'.
+
+You can specify average fragment length and bin size in `\texttt{fragLen}' and `\texttt{binSize}' arguments, respectively,
+and these arguments control the resolution of bin-level ChIP-seq data.
+By default, the average fragment length is set to 200 bp, which is the common fragment length
+for Illumina sequences, and the bin size equals the average fragment length.
+The `\texttt{fragLen}' argument is ignored for PET ChIP-seq data (`\texttt{PET = TRUE}').
+`\texttt{capping}' argument indicates maximum number of reads allowed to start at each nucleotide position.
+Using some small value for capping (e.g., `\texttt{capping=3}') will exclude extremely large read counts that might correspond to PCR amplification artifacts, which is especially useful for the ChIP-seq data with low sequencing depth.
+Capping is not applied (default) if `\texttt{capping}' is set to some non-positive value, e.g., `\texttt{capping=0}'.
+
+\subsection{Reading Bin-Level Data into the R Environment}\label{readBins}
+
+You now have bin-level ChIP data and matched control sample data
+from `\texttt{constructBins}'.
+In this vignette, we use chromosome 21 data from a ChIP-seq
+experiment of STAT1 binding in interferon-$\gamma$-stimulated HeLa S3 cells
+\cite{stat1}. `\texttt{mosaicsExample}' package provides this example dataset.
+
+<<mosaicsExample-prelim>>=
+library(mosaicsExample)
+@
+
+Bin-level data can be imported to the R environment with the command:
+
+<<io-readbin>>=
+exampleBinData <- readBins( type=c("chip","input"),
+    fileName=c( system.file( file.path("extdata","chip_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","input_chr21.txt"), package="mosaicsExample") ) )
+@
+
+For the `\texttt{type}' argument,
+\texttt{"chip"} and \texttt{"input"} indicate bin-level ChIP data
+and control sample data, respectively. You need to specify the corresponding file names
+in `\texttt{fileName}'. `\texttt{mosaics}' package assumes that each file name
+in `\texttt{fileName}' is provided in the same order as in `\texttt{type}'.
+
+In \texttt{mosaics} package, you can do either genome-wide analysis or chromosome-wise analysis
+and this analysis type will be determined automatically
+based on the contents of bin-level files imported using `\texttt{readBins}'.
+If the bin-level files contain more than one chromosome
+(i.e., bin-level files are obtained using `\texttt{byChr=FALSE}' in `\texttt{constructBins}'),
+`\texttt{mosaicsFit}' will analyze all the chromosomes simultaneously (genome-wide analysis).
+Note that if these bin-level files contain different sets of chromosomes,
+then `\texttt{readBins}' method will utilize only the intersection of them.
+If bin-level files are obtained using `\texttt{byChr=TRUE}' in `\texttt{constructBins}',
+each bin-level file contains data for only one chromosome
+and each of these bin-level files need to be analyzed separately (chromosome-wise analysis).
+The genome-wide analysis usually provides more stable model fitting and peak identification results
+so it is recommended for most cases.
+
+R package \texttt{mosaics} provides functions for generating simple summaries
+of the data. The following command prints out basic information about the
+bin-level data, such as number of bins and total ``effective tag counts''.
+``Total effective tag counts'' is defined as the sum of the ChIP tag counts of all
+bins. This value is usually larger than the sequencing depth since tags are
+counted after extension to average fragment length and an extended fragment can
+contribute to multiple bins.
+
+<<io-bindata-show>>=
+exampleBinData
+@
+
+`\texttt{print}' method returns the bin-level data in data frame format.
+
+<<io-bindata-print>>=
+print(exampleBinData)[51680:51690,]
+@
+
+`\texttt{plot}' method provides exploratory plots for the ChIP data. Different
+types of plots can be obtained by varying the `\texttt{plotType}' argument.
+`\texttt{plotType="input"}' generates a plot of mean ChIP tag counts versus
+control tag counts. If `\texttt{plotType}' is not specified, this method plots the
+histogram of ChIP tag counts.
+
+<<io-bindata-plot,eval=FALSE>>=
+plot( exampleBinData )
+plot( exampleBinData, plotType="input" )
+@
+
+Figures \ref{fig:bindata-plot-hist} and \ref{fig:bindata-plot-input} display
+examples of different types of plots. The relationship
+between mean ChIP tag counts and control tag counts seems to be linear,
+especially for small  control tag counts (Figure \ref{fig:bindata-plot-input}).
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-hist,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot(exampleBinData)
+@
+\caption{\label{fig:bindata-plot-hist} Histograms of the  count data
+from ChIP and control samples.}
+\end{center}
+\end{figure}
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-input,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot( exampleBinData, plotType="input" )
+@
+\caption{\label{fig:bindata-plot-input} Mean ChIP tag count versus
+Control tag count.}
+\end{center}
+\end{figure}
+
+\subsection{Fitting the MOSAiCS Model}\label{mosaicsFit}
+
+We are now ready to fit a MOSAiCS model using the bin-level data above
+(\texttt{exampleBinData})   with the command:
+
+<<io-mosaicsfit>>=
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO", bgEst="automatic" )
+@
+
+`\texttt{analysisType="IO"}' indicates implementation of the two-sample analysis.
+`\texttt{bgEst}' argument determines background estimation approach.
+`\texttt{bgEst="matchLow"}' estimates background distribution using only bins with low tag counts
+and it is appropriate for the data with relatively low sequencing depth.
+`\texttt{bgEst="rMOM"}' estimates background distribution using robust method of moments (MOM) and
+it is appropriate for the data with relatively high sequencing depth.
+If `\texttt{bgEst="automatic"}' (default), `\texttt{mosaicsFit}' tries its best guess
+for the background estimation approach, based on the data provided.
+If the goodness of fit obtained using `\texttt{bgEst="automatic"}' is not satisfactory,
+we recommend trying `\texttt{bgEst="matchLow"}' and `\texttt{bgEst="rMOM"}',
+which might improve the model fit.
+
+`\texttt{mosaicsFit}' fits both one-signal-component and two-signal-component
+models. When identifying peaks, you can choose the number of signal components
+to be used for the final model. The optimal choice of the number of signal
+components depends on the characteristics of data. In order to support users
+in the choice of optimal signal model, \texttt{mosaics} package provides
+Bayesian Information Criterion (BIC) values and Goodness of Fit (GOF) plots
+of these signal models.
+
+The following command prints out BIC values of one-signal-component and
+two-signal-component models, with additional information about the parameters
+used in fitting the background (non-enriched) distribution. A lower BIC value
+indicates a better model fit. For this dataset, we conclude that the
+two-signal-component model has a lower BIC and hence it provides a better fit.
+
+<<io-mosaicsfit-show>>=
+exampleFit
+@
+
+`\texttt{plot}' method provides the GOF plot.  This plot allows visual
+comparisons of the fits of the background, one-signal-component, and
+two-signal-component models with the actual data.  Figure
+\ref{fig:mosaicsfit-plot} displays the GOF plot for our dataset and we
+conclude that the two-signal-component model provides a better fit as is also
+supported by its lower BIC value compared to the one-signal component model.
+
+<<io-mosaicsfit-plot,eval=FALSE>>=
+plot(exampleFit)
+@
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-mosaicsfit-plot,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot(exampleFit)
+@
+\caption{\label{fig:mosaicsfit-plot} Goodness of Fit (GOF) plot. Depicted are
+actual data for ChIP and control samples with simulated data from the following
+fitted models: (Sim:N): Background model; (Sim:N+S1): one-signal-component
+model; (Sim:N+S1+S2): two-signal-component model. }
+\end{center}
+\end{figure}
+
+\subsection{Identifying Peaks Based on the Fitted Model}\label{mosaicsPeak}
+
+Using BIC values and GOF plots in the previous section, we concluded that
+two-signal-component model fits our data better. Next, we will identify peaks
+with the two-signal-component model at a false discovery rate (FDR) of 0.05
+using the command:
+
+<<ts-mosaicspeak>>=
+examplePeak <- mosaicsPeak( exampleFit, signalModel="2S", FDR=0.05,
+maxgap=200, minsize=50, thres=10 )
+@
+
+`\texttt{signalModel="2S"}' indicates two-signal-component model. Similarly,
+one-signal-component model can be specified by `\texttt{signalModel="1S"}'.
+FDR can be controlled at the desired level by specifying `\texttt{FDR}' argument.
+In addition to these two essential parameters, you can also control three more
+parameters, `\texttt{maxgap}', `\texttt{minsize}', and `\texttt{thres}'. These
+parameters are for refining initial peaks called using specified signal model
+and FDR.  Initial nearby peaks are merged if the distance (in bp) between them
+is less than `\texttt{maxgap}'. Some initial peaks are removed if their lengths
+are shorter than `\texttt{minsize}' or their ChIP tag counts are less
+than `\texttt{thres}'.
+
+If you use a bin size shorter than the average fragment length in the
+experiment, we recommend setting `\texttt{maxgap}'  to the  average fragment
+length and `\texttt{minsize}'  to the bin size. This setting removes peaks
+that are too narrow (e.g., singletons). If you set the bin size to the average
+fragment length (or maybe bin size is larger than the average fragment length),
+we recommend  setting `\texttt{minsize}'  to a value  smaller than the average
+fragment length while leaving `\texttt{maxgap}' the same as the average
+fragment length. This is to prevent filtering using `\texttt{minsize}' because
+initial peaks would already be at a reasonable width. `\texttt{thres}' is
+employed to filter out initial peaks with very small ChIP tag counts because
+such peaks might be false discoveries. Optimal choice of `\texttt{thres}'
+depends on the sequencing depth of the ChIP-seq data to be analyzed. If you
+don't wish to filter out initial peaks using ChIP tag counts, you can set
+`\texttt{thres}'  to an arbitrary negative value.
+
+The following command prints out a summary of identified peaks including
+the number of  peaks identified, median peak width, and the empirical false
+discovery rate (FDR).
+
+<<io-mosaicspeak-show>>=
+examplePeak
+@
+
+`\texttt{print}' method returns the peak calling results in data frame format.
+This data frame can be used as an input for downstream analysis such as motif
+finding. This output might have different number of columns, depending on
+`\texttt{analysisType}' of `\texttt{mosaicsFit}'. For example, if
+`\texttt{analysisType="TS"}', columns are peak start position, peak end
+position, peak width, averaged posterior probability, minimum posterior
+probability, averaged ChIP tag count, maximum ChIP tag count, averaged control
+tag count, averaged control tag count scaled by sequencing depth, averaged log
+base 2 ratio of ChIP over input tag counts, averaged mappability score, and
+averaged GC content score for each peak. Here, the posterior probability of a
+bin refers to  the probability that the bin is not a peak conditional on data.
+Hence, smaller posterior probabilities provide more evidence that the bin is
+actually a peak.
+
+<<io-mosaicspeak-print>>=
+print(examplePeak)[1:15,]
+@
+
+You can export peak calling results to text files in diverse file formats.
+Currently, `\texttt{mosaics}' package supports TXT, BED, and GFF file formats.
+In the exported file, TXT file format (`\texttt{type="txt"}') includes all the
+columns that `\texttt{print}' method returns. `\texttt{type="bed"}' and
+`\texttt{type="gff"}' export peak calling results in standard BED and GFF file
+formats, respectively, where score is the averaged ChIP tag counts in each
+peak. Peak calling results can be exported in TXT, BED, and GFF file formats,
+respectively, by the commands:
+
+<<io-mosaicspeak-export>>=
+export( examplePeak, type="txt", filename="TSpeakList.txt" )
+export( examplePeak, type="bed", filename="TSpeakList.bed" )
+export( examplePeak, type="gff", filename="TSpeakList.gff" )
+@
+
+`\texttt{filename}' indicates the name of the exported file, which may include
+the path of the directory the file is written to.
+
+\section{Two-Sample Analysis with Mappability and GC Content}\label{two_sample_MGC}
+
+For the two-sample analysis with mappability and GC content and the one-sample analysis,
+you also need bin-level mappability, GC content, and sequence ambiguity
+score files for the reference genome you are working with.
+If you are working with organisms such as human (HG18 and HG19), mouse (MM9),
+rat (RN4), and Arabidopsis (TAIR9), you can download their corresponding
+preprocessed mappability, GC content, and sequence ambiguity score files
+at \url{http://www.stat.wisc.edu/~keles/Software/mosaics/}.
+If your reference genome of interest is not listed on our website,
+you can inquire about it at our Google group,
+\url{http://groups.google.com/group/mosaics_user_group}, and we would be happy
+to add your genome of interest to the list.
+The companion website also provides
+all the related scripts and easy-to-follow instructions to prepare these files.
+Please check \url{http://www.stat.wisc.edu/~keles/Software/mosaics/}
+for more details.
+
+You can import bin-level data and fit MOSAiCS model
+for the two-sample analysis using mappability and GC content with the commands:
+
+<<ts-readbin>>=
+exampleBinData <- readBins( type=c("chip","input","M","GC","N"),
+    fileName=c( system.file( file.path("extdata","chip_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","input_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","M_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","GC_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","N_chr21.txt"), package="mosaicsExample") ) )
+@
+
+For the  `\texttt{type}' argument, \texttt{"chip"}, \texttt{"input"},
+\texttt{"M"}, \texttt{"GC"}, and \texttt{"N"} indicate bin-level ChIP data,
+control sample data, mappability score, GC content score, and sequence
+ambiguity score, respectively.
+
+When you have mappability and GC contents,
+`\texttt{plot}' method provides additional plot types.
+`\texttt{plotType="M"}' and `\texttt{plotType="GC"}' generate plots of mean
+ChIP tag counts versus mappability and GC content scores, respectively.
+Moreover, `\texttt{plotType="M|input"}' and
+`\texttt{plotType="GC|input"}' generate plots of mean ChIP tag counts versus
+mappability and GC content scores, respectively, conditional on control tag
+counts.
+
+<<ts-bindata-plot>>=
+plot( exampleBinData, plotType="M" )
+plot( exampleBinData, plotType="GC" )
+plot( exampleBinData, plotType="M|input" )
+plot( exampleBinData, plotType="GC|input" )
+@
+
+As discussed in \cite{mosaics},
+we observe that mean ChIP tag count increases as mappability score increases
+(Figure \ref{fig:bindata-plot-M}). Mean ChIP tag count depends on GC score
+in a non-linear fashion (Figure \ref{fig:bindata-plot-GC}).
+When we condition on control tag counts (Figures \ref{fig:bindata-plot-M-input}
+and \ref{fig:bindata-plot-GC-input}), mean ChIP tag count versus mappability
+and GC content relations exhibit patterns similar to those of the marginal plots
+given in Figures  \ref{fig:bindata-plot-M} and \ref{fig:bindata-plot-GC}.
+MOSAiCS incorporates this observation by modeling ChIP tag counts from non-peak
+regions with a small number of control tag counts as a function of
+mappability, GC content, and control tag counts.
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-M,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot( exampleBinData, plotType="M" )
+@
+\caption{\label{fig:bindata-plot-M} Mean ChIP tag count versus Mappability.}
+\end{center}
+\end{figure}
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-GC,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot( exampleBinData, plotType="GC" )
+@
+\caption{\label{fig:bindata-plot-GC} Mean ChIP tag count versus GC content.}
+\end{center}
+\end{figure}
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-M-input,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot( exampleBinData, plotType="M|input" )
+@
+\caption{\label{fig:bindata-plot-M-input} Mean ChIP tag count versus
+Mappability, conditional on control tag counts.}
+\end{center}
+\end{figure}
+
+\begin{figure}[tb]
+\begin{center}
+<<fig-bindata-plot-GC-input,fig=TRUE,height=5,width=5,echo=FALSE>>=
+plot( exampleBinData, plotType="GC|input" )
+@
+\caption{\label{fig:bindata-plot-GC-input} Mean ChIP tag count versus
+GC content, conditional on control tag counts.}
+\end{center}
+\end{figure}
+
+Application of MOSAiCS to multiple case studies of ChIP-seq data
+with low sequencing depth showed that consideration of
+mappability and GC content in the model improves sensitivity and specificity
+of peak identification even in the presence of a control sample \cite{mosaics}.
+\texttt{mosaics} package accommodates a two-sample analysis with
+mappability and GC content by specification of `\texttt{analysisType="TS"}'
+when calling the `\texttt{mosaicsFit}' method.
+
+<<ts-mosaicsfit,eval=FALSE>>=
+exampleFit <- mosaicsFit( exampleBinData, analysisType="TS", bgEst="automatic" )
+@
+
+Peak identification can be done exactly in the same way
+as in the case of the two-sample analysis without mappability and GC content.
+
+<<os-mosaicspeak,eval=FALSE>>=
+examplePeak <- mosaicsPeak( exampleFit, signalModel="2S", FDR=0.05,
+maxgap=200, minsize=50, thres=10 )
+@
+
+\section{One-Sample Analysis}\label{one_sample}
+
+When a control sample is not available, `\texttt{mosaics}' package accommodates
+one-sample analysis of ChIP-seq data. Implementation of the MOSAiCS one-sample
+model is very similar to that  of  the two-sample analysis. Bin-level data for
+the one-sample analysis can be imported to the R environment with the command:
+
+<<os-readbin,eval=FALSE>>=
+exampleBinData <- readBins( type=c("chip","M","GC","N"),
+    fileName=c( system.file( file.path("extdata","chip_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","M_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","GC_chr21.txt"), package="mosaicsExample"),
+    system.file( file.path("extdata","N_chr21.txt"), package="mosaicsExample") ) )
+@
+
+%Note that you don't need to provide `\texttt{"input"}' in `\texttt{type}' and
+%the file name of  a control dataset in `\texttt{fileName}' here.
+In order to fit a MOSAiCS model for the one-sample analysis, you need to specify
+`\texttt{analysisType="OS"}' %instead of `\texttt{analysisType="TS"}'
+when calling  the `\texttt{mosaicsFit}' method.
+
+<<os-mosaicsfit,eval=FALSE>>=
+exampleFit <- mosaicsFit( exampleBinData, analysisType="OS", bgEst="automatic" )
+@
+
+Peak identification can be done exactly in the same way as in the case
+of the two-sample analysis.
+
+<<os-mosaicspeak,eval=FALSE>>=
+examplePeak <- mosaicsPeak( exampleFit, signalModel="2S", FDR=0.05,
+maxgap=200, minsize=50, thres=10 )
+@
+
+\section{Generating Wiggle Files to View on Genome Browsers}\label{generateWig}
+
+R package `\texttt{mosaics}' can generate wiggle files (\url{http://genome.ucsc.edu/goldenPath/help/wiggle.html}) for visualization purposes.
+These wiggle files can be used as input for many genome browsers such as the UCSC genome browser (\url{http://genome.ucsc.edu/}).
+These wiggle files can easily be generated from the aligned read files with the command:
+
+<<generateWig,eval=FALSE>>=
+generateWig( infile="/scratch/eland/STAT1_eland_results.txt",
+    fileFormat="eland_result", outfileLoc="/scratch/eland/",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr="chrM",
+    PET=FALSE, fragLen=200, span=200, capping=0, normConst=1 )
+@
+
+The `\texttt{generateWig}' function has arguments similar to those of the `\texttt{constructBins}' function,
+except that it has `\texttt{span}' and `\texttt{normConst}' arguments
+instead of the `\texttt{binSize}' argument of the `\texttt{constructBins}' function.
+The `\texttt{generateWig}' function supports the same set of aligned read file formats as in the `\texttt{constructBins}' function.
+The `\texttt{span}' argument indicates span used in wiggle files.
+The `\texttt{normConst}' argument means the normalizing constant to scale the values in wiggle files
+and it is especially useful when wiggle files for multiple related samples are generated and compared.
+In the resulting wiggle files, values are scaled by the value specified in the `\texttt{normConst}' argument.
+
+`\texttt{generateWig}' can generate a single wiggle file containing all chromosomes or multiple wiggle files for each chromosome.
+If `\texttt{byChr=FALSE}', all chromosomes are exported to one file
+named as `\texttt{[infileName]}$\_$\texttt{fragL[fragLen]}$\_$\texttt{span[span].wig}' (for SET data)
+or `\texttt{[infileName]}$\_$\texttt{span[span].wig}' (for PET data),
+where \texttt{[infileName]}, \texttt{[fragLen]}, and \texttt{[span]} are
+name of aligned read file, average fragment length, and span, respectively.
+If `\texttt{byChr=TRUE}', each chromosome is exported to a separate file named as `\texttt{[infileName]}$\_$\texttt{fragL[fragLen]}$\_$\texttt{span[span]}$\_$\texttt{[chrID].wig}' (for SET data) or
+`\texttt{[infileName]}$\_$\texttt{span[span]}$\_$\texttt{[chrID].wig}' (for PET data),
+where \texttt{[chrID]} is chromosome ID that reads align to.
+%These chromosome IDs (\texttt{[chrID]}) are extracted from the aligned read file.
+The constructed wiggle files are exported to the directory
+specified in `\texttt{outfileLoc}' argument.
+
+%\section{Tuning Parameters to Improve the MOSAiCS Fit}\label{tuning}
+\section{Case Studies: Tuning Parameters to Improve the MOSAiCS Fit}\label{case_studies}
+
+Because the \texttt{mosaics} package is based on the statistical modeling approach,
+it is important to check that we have nice model fits.
+Goodness of fit (GOF) plots generated from the `\texttt{plot}' method
+are key tools that users can utilize to check the MOSAiCS fit and improve it if necessary.
+In this section, we will discuss several case studies based on real ChIP-Seq data,
+especially focusing on understanding unsatisfactory MOSAiCS fits and suggesting strategies to improve them.
+The case studies illustrated in this section are based on \cite{bookchapter}
+and we provided deeper discussion and example analysis codes in this book chapter.
+
+%\begin{figure}[ht]
+%\centering
+%\subfigure[]{
+%   \includegraphics[width=0.5\textwidth] {Figure4a.pdf}
+% }
+%\subfigure[]{
+%   \includegraphics[width=0.5\textwidth] {Figure4b.pdf}
+% }
+%\caption{ \label{fig:case_studies_histogram} }
+%\end{figure}
+
+\subsection{Case 1}
+
+\begin{figure}[ht]
+\centering
+\subfigure[When the background estimation approach was automatically chosen]{
+   \includegraphics[width=0.5\textwidth] {GOF_matchLow.pdf}
+ }
+\subfigure[When the background estimation approach was explicitly specified]{
+   \includegraphics[width=0.5\textwidth] {GOF_rMOM.pdf}
+ }
+\caption{ Case \#1. Goodness of fit when the background estimation approach was automatically chosen by \texttt{mosaics} package (a) and explicitly specified as robust method of moments (b). The MOSAiCS fit was significantly improved by explicitly specifying the background estimation approach. \label{fig:case_studies_GOF_no1} }
+\end{figure}
+
+Figure \ref{fig:case_studies_GOF_no1}a shows the GOF plot for the two-sample analysis without using mappability and GC content. The GOF plot indicates that ChIP tag counts simulated from the fitted model are on average higher than the actual ChIP-Seq data. Moreover, estimated background dominates in the fitted model. In such cases (\textit{over-estimation of background}), we usually obtain a much smaller number of peaks. This problem occurred essentially because `\texttt{mosaicsFit}', through its automatic selection (`\texttt{bgEst="automatic"}'), chose estimation using bins with low ChIP tag counts (`\texttt{bgEst="matchLow"}') as the optimal background estimation approach. Although automatic selection usually works well, there are some cases in which it does not select the optimal background estimation approach. In this case, the MOSAiCS fit can be improved by explicitly specifying the background estimation approach with the command:
+
+<<tuning-case1,eval=FALSE>>=
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO", bgEst="rMOM" )
+@
+
+Figure \ref{fig:case_studies_GOF_no1}b shows that the MOSAiCS fit was significantly improved by explicitly using the robust method of moments approach (`\texttt{bgEst="rMOM"}').
+
+\subsection{Case 2}
+
+\begin{figure}[ht]
+\centering
+\subfigure[`\texttt{truncProb = 0.999}' (default)]{
+   \includegraphics[width=0.5\textwidth] {Figure5c.pdf}
+ }
+\subfigure[`\texttt{truncProb = 0.80}']{
+   \includegraphics[width=0.5\textwidth] {Figure5d.pdf}
+ }
+\caption{ Case \#2. Goodness of fit when default `\texttt{truncProb}' value (0.999) is used (a) and `\texttt{truncProb = 0.80}' is explicitly specified (b). The MOSAiCS fit was significantly improved by lowering the `\texttt{truncProb}' value.
+\label{fig:case_studies_GOF_no2} }
+\end{figure}
+
+Figure \ref{fig:case_studies_GOF_no2}a shows the GOF plot for the two-sample analysis without using mappability and GC content. The GOF plot indicates that the MOSAiCS fit is unsatisfactory for the bins with low ChIP tag counts. This problem occurred essentially because background estimation was affected by some bins with high tag counts in the matched control data. In the two-sample analysis without using mappability and GC content, the `\texttt{truncProb}' argument indicates the proportion of bins that are not considered as outliers in the control data and it controls the functional form used for the control data. Although our empirical studies indicate that the default `\texttt{truncProb}' value usually works well, there are some cases in which it does not result in optimal background estimation. In this case, the MOSAiCS fit can be improved by lowering the `\texttt{truncProb}' value with the command:
+
+<<tuning-case2,eval=FALSE>>=
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO",
+	bgEst="automatic", truncProb=0.80 )
+@
+
+Figure \ref{fig:case_studies_GOF_no2}b shows that the MOSAiCS fit was significantly improved by using `\texttt{truncProb = 0.80}'.
+
+\subsection{Case 3}
+
+\begin{figure}[ht]
+\centering
+\subfigure[Two-sample analysis without mappability and GC content]{
+   \includegraphics[width=0.5\textwidth] {Figure5a.pdf}
+ }
+\subfigure[Two-sample analysis with mappability and GC content]{
+   \includegraphics[width=0.5\textwidth] {Figure5b.pdf}
+ }
+\caption{ Case \#3. Goodness of fit for the two-sample analysis without utilizing mappability and GC content (a). The MOSAiCS fit was improved when we use the two-sample analysis utilizing mappability and GC content (b).
+\label{fig:case_studies_GOF_no3} }
+\end{figure}
+
+Figure \ref{fig:case_studies_GOF_no3}a displays the GOF plot for the two-sample analysis without utilizing mappability and GC content. Neither the blue (two-signal component) nor the red (one-signal component) curve provides good fit to the ChIP data. In this case, we consider fitting a two-sample analysis with mappability and GC content. Figure \ref{fig:case_studies_GOF_no3}b displays the GOF plot for the two-sample analysis with mappability and GC content in addition to Input and the goodness of fit improves significantly by utilizing mappability and GC content.
+
+\subsection{Note on Parameter Tuning}
+
+Overall, unsatisfactory model fits can be improved via the tuning parameters in the `\texttt{mosaicsFit}' function. Our empirical studies suggest that as the sequencing depth increases, the genomic features mappability and GC content have less of an impact on the overall model fit. In particular, we suggest tuning the two-sample model without mappability and GC content in the cases of unsatisfactory fits before switching to a fit with mappability and GC content. The following are some general tuning suggestions: for the two-sample analysis without utilizing mappability and GC content, lowering the value of the `\texttt{truncProb}' parameter; for the two-sample analysis with mappability and GC content, using a larger `\texttt{s}' parameter and a smaller `\texttt{meanThres}' parameter; for the one-sample analysis with mappability and GC content, varying the `\texttt{meanThres}' parameter. Although MOSAiCS has two additional tunable parameters, `\texttt{k}' and `\texttt{d}', we have accumulated ample empirical evidence through analyzing many datasets that the default values of `\texttt{k = 3}' and `\texttt{d = 0.25}' work well for these parameters \cite{bookchapter}. If you encounter a fitting problem you need help with, feel free to contact us at our Google group, \url{http://groups.google.com/group/mosaics_user_group}.
+
+%In the two-sample analysis using mappability and GC content, users can control three tuning parameters: `\texttt{s}', `\texttt{d}', and `\texttt{meanThres}'.  `\texttt{s}' and `\texttt{d}' are parameters of the background distribution and control the functional form used for the control data. Please see \cite{mosaics} for further details on these two model parameters. `\texttt{meanThres}' controls the number of strata used at the robust linear regression modeling step of the background distribution fitting. `\texttt{mosaics}' package uses the following parameter setting as default:
+
+%<<tuning-ts,eval=FALSE>>=
+%exampleFit <- mosaicsFit( exampleBinData, analysisType="TS",
+%	bgEst="automatic", meanThres=1, s=2, d=0.25 )
+%@
+
+%Users might need to consider parameter tuning especially when the fitted background model is too similar to the actual data, resulting in too few peaks. If such cases are detected or predicted, `\texttt{mosaicsFit}' prints out warning or error messages. You may also be able to detect this case using the GOF plot. Using a higher `\texttt{s}' value and lower `\texttt{meanThres}' often solves the problem, e.g., `\texttt{s = 6}' and `\texttt{meanThres = 0}'.
+
+%In addition to `\texttt{analysisType}', `\texttt{mosaicsFit}' method provides parameters to tune the background distribution of the MOSAiCS model. We specified appropriate default values for these parameters based on computational experiments and analysis of diverse ChIP-seq datasets. Default values work well in general but some tuning might be required for some cases. You may need to consider parameter tuning if the fitted background model is too similar to the actual data in the GOF plot or you encounter some warning or error messages while running `\texttt{mosaicsFit}' method. Section \ref{tuning} provides basic guidelines on parameter tuning. If you encounter a fitting problem you need help with, feel free to contact us at our Google group, \url{http://groups.google.com/group/mosaics_user_group}.
+
+\section{Conclusion and Ongoing Work}\label{conclusion}
+
+R package \texttt{mosaics} provides effective tools to read and investigate
+ChIP-seq data, fit MOSAiCS model, and identify peaks. We are continuously
+working on improving \texttt{mosaics} package further, especially in supporting
+more diverse genomes, automating fitting procedures, developing more friendly
+and easy-to-use user interface, and providing more effective data investigation
+tools. Please post any questions or requests regarding `\texttt{mosaics}'
+package at \url{http://groups.google.com/group/mosaics_user_group}. Updates and
+changes of `\texttt{mosaics}' package will be announced at our Google group and
+the companion website (\url{http://www.stat.wisc.edu/~keles/Software/mosaics/}).
+
+\section*{Acknowledgements}
+
+We thank Gasch, Svaren, Chang, Kiley, Bresnick, Pike, and Donohue Labs at the University of Wisconsin-Madison for sharing their data for MOSAiCS analysis and useful discussions. We also thank Colin Dewey and Bo Li for the CSEM output in Appendix A.7.
+
+\begin{thebibliography}{99}
+\bibitem{mosaics} Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Kele\c{s}
+(2011), ``A Statistical Framework for the Analysis of ChIP-Seq Data'',
+\textit{Journal of the American Statistical Association}, 106, 891-903.
+\bibitem{stat1} Rozowsky, J, G Euskirchen, R Auerbach, D Zhang, T Gibson,
+R Bjornson, N Carriero, M Snyder, and M Gerstein (2009), ``PeakSeq enables
+systematic scoring of ChIP-Seq experiments relative to controls'',
+\textit{Nature Biotechnology}, 27, 66-75.
+\bibitem{bookchapter} Sun, G, D Chung, K Liang, S Kele\c{s} (2012),
+``Statistical Analysis of ChIP-Seq Data with MOSAiCS'' (submitted).
+\end{thebibliography}
+
+\appendix
+
+\clearpage
+
+\section{Appendix: Example Lines of Aligned Read Files for SET ChIP-Seq Data}\label{example_aligned_SET}
+
+\subsection{Eland Result File Format}
+
+\begin{verbatim}
+>HWUSI-EAS787_0001:6:1:2020:1312#NTAGGC/1	TTGCGGGTTAACAAAAACGATGTAAACCATCG	U0	1	0	0	U00096	3894480	F	..
+>HWUSI-EAS787_0001:6:1:2960:1312#NTAGGC/1	TATTTTCCTGGTTCTGCTGCTGGTGAGCGGCG	U0	1	0	0	U00096	4079496	R	..
+>HWUSI-EAS787_0001:6:1:3045:1310#NTAGGC/1	AGCGTATCAAAACTGACAATTCATTCTATGAA	U0	1	0	0	U00096	2087194	F	..
+>HWUSI-EAS787_0001:6:1:3887:1310#NTAGGC/1	TCCGAGTTGTCGCAATGGGGGCAAGGTGTGAA	U1	0	1	0	U00096	2405216	R	..	26C
+>HWUSI-EAS787_0001:6:1:3993:1315#TTAGGC/1	AGACATAGTAAAACCGATACCGCACAGGATCC	U0	1	0	0	U00096	18483	R	..
+>HWUSI-EAS787_0001:6:1:4156:1312#NTAGGC/1	GCCGAACTGACAAATAAAATAGCCTGATAGGA	U0	1	0	0	U00096	2687648	F	..
+>HWUSI-EAS787_0001:6:1:4215:1310#NTAGGC/1	GGAATTTCACGAAAACAGAGCTAAAGCGCCGT	U0	1	0	0	U00096	4569776	F	..
+>HWUSI-EAS787_0001:6:1:4458:1311#NTAGGC/1	GTTCCTGGCAGGGATTCGGCGCACTCGCCCAC	U0	1	0	0	U00096	3607517	F	..
+>HWUSI-EAS787_0001:6:1:4841:1308#NTAGGC/1	GGCGTGTGCATCAGCAGAATTTTTCGGGCGGT	U0	1	0	0	U00096	3715588	R	..
+>HWUSI-EAS787_0001:6:1:5215:1316#TTAGGC/1	TGATATTCTCCGCAGCCAGACTTTTTCCGCCA	U0	1	0	0	U00096	668432	R	..
+\end{verbatim}
+
+\subsection{Eland Extended File Format}
+
+\begin{verbatim}
+>SOLEXA2_1110:7:1:1599:1010#0/1	NNTTTATTTCAACAGGATACTAATATCCTAGGCCCTTTTC	1:0:0	chr21.fa:26418321RCT38
+>SOLEXA2_1110:7:1:2734:1009#0/1	NNCCCCTCCAATGACTATAGCAATGATCTTCATGGTAGAT	1:0:0	chr18.fa:37353206RTC38
+>SOLEXA2_1110:7:1:3130:1007#0/1	NNTCTTATATTCTTACGAAGCATCAGTCTTAGCAGAAACT	1:0:0	chr4.fa:141242065RCC38
+>SOLEXA2_1110:7:1:3709:1009#0/1	NNAAAAAATACAAAAATTAGGCTGTGCACGGTGGTGCAGG	0:1:0	chr18.fa:12860297FCA25G6CT2T1
+>SOLEXA2_1110:7:1:4887:1009#0/1	NNTTATATAACAATTCAACAACTATTTCTTGAGCATCTAA	1:0:0	chr13.fa:60908544RAA38
+>SOLEXA2_1110:7:1:6298:1011#0/1	NNAGACACACATTTGTTTTTTGTATTTTTCACATCTCTTT	1:0:0	chr11.fa:47506195FAA38
+>SOLEXA2_1110:7:1:7798:1011#0/1	NNAGACACTGAGATTTTATGATACTGATTATTGTCGCATT	1:0:0	chr11.fa:25291690FAC38
+>SOLEXA2_1110:7:1:7970:1007#0/1	NNGGCTCCTAAAAAGCATACTTTTTTTTTGTTTTCTGTAT	0:0:1	chr21.fa:19842628RGA27^T$11
+>SOLEXA2_1110:7:1:8051:1007#0/1	NNTCACATTACCTTTACTAATCCTGAGAGAGCAGGAACTC	1:0:0	chr7.fa:47191653FCA38
+>SOLEXA2_1110:7:1:8422:1011#0/1	NNGGGCTTTTGGGGATGATGGTGTCACCAGGTGGGGAATT	1:0:0	chr1.fa:59480447RAA38
+\end{verbatim}
+
+\subsection{Eland Export File Format}
+
+\begin{verbatim}
+AMELIA	102	5	1	7564	6792	...T..	1	TCTCATTTTTTACGTTTTTCAGTGATTTCGTCATTTTTCAAGTCGTCAAG	ggggggggggggggggggggggeggggggggegggggeeggggggggggg	0:48:71											Y
+AMELIA	102	5	1	7589	6796	...A..	1	TGTGCATTTCACATTTTTCACGTTTTTTAGTGATTTCGTCATTTTTCAAG	ggggggggggggggggggggggggggggfgeggggggggggggggggggg	chr2.fa		98506985	R	50	40						Y
+AMELIA	102	5	1	7721	6804	...G..	1	TATGAGACAGTGGATATGTTATGTAATATCTCCAAGTCAACTTCTTTTCA	cca`caabbceceeaca\ccbccZc[[_\^a_\cbdcdc_YWRPXa^BBB	chrX.fa		18454546	F	50	190						Y
+AMELIA	102	5	1	7516	6810	...C..	1	GGGACAGTTCTGCAGACTTGGGACTCCACTCTAGAGAGGAAGAAAGAGAG	RUQKUV[\]S_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB	chr1.fa		101923817	F	25A12A11	29						N
+AMELIA	102	5	1	7650	6808	...C..	1	AGAGGTGCCTGCACCTCCAGCCACCTGGGAGGCTGTGTCAGGAGGATCAC	dddddaeffegggggegggfegggggggggabcdegg^eged`bdb]_cb	chr12.fa		80471182	F	50	203						Y
+AMELIA	102	5	1	7703	6811	...C..	1	CTTTTAGATAACAATCCCATATTCAGGAAGTTTTATTCAATTCATTCAAG	gggfeggggffffgggggfgfgdgffgggfddfeddfff^gfgagggagg	chr4.fa		17871000	F	50	203						Y
+AMELIA	102	5	1	7524	6826	...A..	1	ACTTCGCCTTACTGTAAGTGTATCTCACGCATGCAGAGCTCAGCAGCGGT	BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB	NM											N
+AMELIA	102	5	1	7600	6825	...T..	1	GATATATAAATATTTATTATATGTAAAACACGTATTTTAAAGAACTTAAA	gggggfggfggfgffgfggfddeddgggdgfdaeeggaeeffdgegbgge	chr15.fa		57197391	R	50	203						Y
+AMELIA	102	5	1	7564	6837	...A..	1	TTCGGCTTGGCAGCAAGCGCCAACTACCCCCTAAGCCAGCTTTCGAGCCT	bbbbbbbbK^BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB	chr1.fa		129403054	R	29A20	27						N
+AMELIA	102	5	1	7620	6840	...A..	1	CTTGACGACTTGAAAAATGACGAATTCACTAAAATACGTGAAAAAAGAGA	fggffgfdggeeffdfdffdedfgggfeeebddddfdfebbdd_dgggge	0:4:13											Y
+\end{verbatim}
+
+\subsection{Default Bowtie File Format}
+
+\begin{verbatim}
+HWI-EAS255_8232_FC30D82_6_1_11_1558	-	chr7	82587091	ATCTTGTTTTATCTTGAAAAGCAGCTTCCTTAAAC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_12_793	+	chr4	33256683	GTTAATCGGGAAAAAAACCTTTTCCAGGAAAACAA	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_17_606	-	chr12	92751594	GCCTACATCCTCAGATTCATACATGTCACCATTTC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_29_1543	+	chr20	53004497	GTGACCTAGTTAAATACTGGCTCCACCTCTTGGAG	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_22_327	+	chr8	93861371	GAAATAGCAGGAGACAGGTACTTTATCTTGCTTTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_10_1544	-	chr4	121630802	TTCCCTAACATCTGTGTCAATTCTCTGTCAACTGC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_108_1797	+	chr8	131141865	GTTGAATGAAATGGATATGAAACAAAGCACTATAC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+HWI-EAS255_8232_FC30D82_6_1_41_1581	-	chr7	16554954	CTCTTGCTCTCTTCATTAGTTTAGTTTTCTTCTAC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0	22:G>T
+HWI-EAS255_8232_FC30D82_6_1_12_787	-	chr8	119427380	CCCTAGAGGAGCTCAAAACAACTGCACAACACAAC	IIIEIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0	23:T>C
+HWI-EAS255_8232_FC30D82_6_1_6_1497	+	chr1	233856805	GTTAAAGGCGTTTGGATATGATGCAATTCCAAATC	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0
+\end{verbatim}
+
+\subsection{SAM File Format}
+
+\begin{verbatim}
+SALLY:187:B02JLABXX:5:1101:1418:1923	65	U00096	1943409	30	51M	*	0	0	GTTCGCGAATTAAAGTTTCACTGCTGGCGTCAGGGCGAGCGATTTTGCTCA	#1:DB??@F?DFFB@<E?CGCHIICHG<G?FH0B?76@FG=8?;;>DA@C>	NM:i:1	MD:Z:0C50
+SALLY:187:B02JLABXX:5:1101:1441:1969	65	U00096	3310343	30	51M	*	0	0	GCGGACGCGCTTCACGCGGAACTTCAATGCCCTGACGCGCATATTCGTACA	#1=BDBDDHA?D?ECCGDE:7@BGCEI@CGGGICGHE6BABBBCDCC>A?@	NM:i:1	MD:Z:0T50
+SALLY:187:B02JLABXX:5:1101:1685:1925	65	U00096	2164008	30	51M	*	0	0	GGCACTTCCCTGAAATGCCGATCCACCTTTCGGTGCAGGCTAACGCCGTGA	#1=DFFFFHHHGHIIIIIIIEHGIIIIIIIIIIFCHIIIIIHIGGGHH@BH	NM:i:1	MD:Z:0A50
+SALLY:187:B02JLABXX:5:1101:1587:1933	65	U00096	2140205	30	51M	*	0	0	GGCCAGGCTTCAATATCTCGGTCACACAGACGCATGGCATTTTCTCCTTTC	#1=DDFFFHHHHGIJIIJJJJHIGIJJJJJJIJJJJJHIJJJJJJJJJJJJ	NM:i:1	MD:Z:0A50
+SALLY:187:B02JLABXX:5:1101:1635:1986	81	U00096	1432777	30	51M	*	0	0	AACGTGCTGCGGTTGGTTTGACTTTGATTGAGACGTTTTGGAATTTTTTTC	HEJJJJJJJJIJJGJJJJJIHJJJJJJIGJJIHJJJJJHHHHHDFFFD=1#	NM:i:0	MD:Z:51
+SALLY:187:B02JLABXX:5:1101:1607:2000	65	U00096	932738	30	51M	*	0	0	GGGCGTCATCAGTCCAGCGACGAATACATTGATTATTTTGCCGTTTCGCTA	#1=BDD?DFFFFFI<AE>GIFFBGFIIFGIBFIIFIEIIIFEF;ADFFAEI	NM:i:1	MD:Z:0T50
+SALLY:187:B02JLABXX:5:1101:1937:1904	81	U00096	4541497	30	51M	*	0	0	AAACGTTGGTGTGCAGATCCTGGACAGAACGGGTGCTGCGCTGACGCTGGC	JJIJJJJJJJJJJJIJJJJJIHHIGIIJJJJJIGIJJJHHHHHFFFDD=4#	NM:i:1	MD:Z:50A0
+SALLY:187:B02JLABXX:5:1101:1881:1942	65	U00096	4003009	30	51M	*	0	0	GGCTGCAGGAGCATGACAATCCGTTCACGCTCTATCCTTATGACACCAACT	#1:BDFA>FDHBHHIGHICFFGGHE3??FFEHGCCBFFC@FHIGFFFFIII	NM:i:1	MD:Z:0T50
+SALLY:187:B02JLABXX:5:1101:1919:1960	81	U00096	2237700	30	51M	*	0	0	GCGTTCGGGCCAGACAGCCAGGCGTCCATCTTATCTTTCGCCTGAGCGGTC	###############B(;?BGGBFGBD??99?9CBAE@AFFDFD1D?=11#	NM:i:1	MD:Z:50G0
+SALLY:187:B02JLABXX:5:1101:1900:1973	65	U00096	442575	30	51M	*	0	0	GCGGTGGAACTGTTTAACGGTTTCAACCAGCAGAGTGCTATCGCGAAAACA	#1=DBDFFHHGHHHIJJJJJGHIHJJHIII=FGAGBFGGIJJJJJEIJJGG	NM:i:1	MD:Z:0A50
+\end{verbatim}
+
+\subsection{BED File Format}
+
+\begin{verbatim}
+chrA	880	911	U	0	+
+chrA	883	914	U	0	+
+chrA	922	953	U	0	+
+chrA	931	962	U	0	+
+chrA	950	981	U	0	+
+chrA	951	982	U	0	+
+chrA	959	990	U	0	+
+chrA	963	994	U	0	+
+chrA	965	996	U	0	+
+chrA	745	776	U	0	-
+\end{verbatim}
+
+\subsection{CSEM File Format}
+
+\begin{verbatim}
+chr11	114728597	114728633	HWUSI-EAS610:1:1:0:647#0/1	1000.00	+
+chr5	138784256	138784292	HWUSI-EAS610:1:1:0:498#0/1	1000.00	-
+chr3	8516078	8516114	HWUSI-EAS610:1:1:0:631#0/1	1000.00	-
+chr7	123370863	123370899	HWUSI-EAS610:1:1:0:508#0/1	1000.00	+
+chr4	137837148	137837184	HWUSI-EAS610:1:1:0:1790#0/1	1000.00	-
+chr11	84363281	84363317	HWUSI-EAS610:1:1:0:862#0/1	1000.00	-
+chr7	66950830	66950866	HWUSI-EAS610:1:1:0:1675#0/1	371.61	-
+chr7	66938672	66938708	HWUSI-EAS610:1:1:0:1675#0/1	628.39	-
+chr15	57549345	57549381	HWUSI-EAS610:1:1:0:969#0/1	1000.00	-
+chr9	3012912	3012948	HWUSI-EAS610:1:1:0:194#0/1	448.96	+
+\end{verbatim}
+
+\clearpage
+
+\section{Appendix: Example Lines of Aligned Read Files for PET ChIP-Seq Data}\label{example_aligned_PET}
+
+\subsection{Eland Result File Format}
+
+\begin{verbatim}
+>SALLY:194:B045RABXX:3:1101:1225:2090	GAGCAACGGCCCGAAGGGCGAGACGAAGTCGAGTCA	U0	1	0	0	U00096	779900	R	..
+>SALLY:194:B045RABXX:3:1101:1225:2090	GTTGCCACGGGATATCAAACAAACCGAAAGCAACGA	U0	1	0	0	U00096	779735	F	..
+>SALLY:194:B045RABXX:3:1101:1197:2174	GGTTGTCTTCCGGGTTGAGGCGCGGAATGTCGAACG   U0	1	0	0	U00096	4080317	R	..
+>SALLY:194:B045RABXX:3:1101:1197:2174	GTTTTCAGCTCAGCAACGCGCTCGCTCGCCAGCGTT   U0	1	0	0	U00096	4080169	F	..
+>SALLY:194:B045RABXX:3:1101:1448:2081	TGAAATTCATCATGGCTGATACTGGCTTTGGTAAAA   U0	1	0	0	U00096	2474369	F	..
+>SALLY:194:B045RABXX:3:1101:1448:2081	CCAATGGGTAAAATAGCGGGTAAAATATTTCTCACA   U0	1	0	0	U00096	2474522	R	..
+>SALLY:194:B045RABXX:3:1101:1460:2102	ACATGTTCTTATTCTTACCTCGTAATATTTAACGCT   U0	1	0	0	U00096	2127671	R	..
+>SALLY:194:B045RABXX:3:1101:1460:2102	TTTCCAGATACTCACGGGTGCCGTCGTTGGAACCGC   U0	1	0	0	U00096	2127518	F	..
+>SALLY:194:B045RABXX:3:1101:1319:2153	AATGAAATGCTGTTTTCATAAAAAATAAAATTGAAG   U0	1	0	0	U00096	1785339	R	..
+>SALLY:194:B045RABXX:3:1101:1319:2153	TTATCTTTGTATATTTAACCGGATGAAAAAAACGGT   U1	0	1	0	U00096	1785190	F	..
+\end{verbatim}
+
+\subsection{SAM File Format}
+
+\begin{verbatim}
+@HD	VN:1.0	SO:unsorted
+@SQ	SN:U00096.2	LN:4639675
+@PG	ID:Bowtie	VN:0.12.5	CL:"bowtie -q -X 1000 --fr -p 4 -S -n 2 -e 70 -l 28 --pairtries 100 --maxbts 125 -y -k 1 --phred33-quals /mnt/data/galaxy/tmp/tmpFJuBnU/tmpGke2nl -1 /mnt/data/galaxy/database/files/002/dataset_2246.dat -2 /mnt/data/galaxy/database/files/002/dataset_2247.dat"
+AMELIA:216:C0LAWACXX:4:1101:1138:2071	99	U00096.2	182616	255	65M	=	182749	198	TTGCTGGTACTTTCACAGGGACGAGTCGTCGATATCGATGACGCGGTAGCACGTCATCTGCACGG	@@CFFFFDHFFFHBG@FAFEGGGGHHIIIBHIGCHHIHGIIIIIHIHHDDD@A>;?BDDDCCDDD	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1138:2071	147	U00096.2	182749	255	65M	=	182616	-198	GTGAACCAGAGAATCTGCGTAAATATGGCGAACTGGTCTGCATGACGGCTGAAATGATGCTGGAA	HIIIGJIJJJIJIHGGGFGCIJGIIJJIHDIHGJJJJJIIJJIIHIFJIIJJHHHHHFDFFFC@C	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1425:2097	99	U00096.2	4381249	255	65M	=	4381349	165	TGTTTACCTTTGGCGTAGAGCCAAATATTGGCAAAGAAAAACCGACCTTTGTGTACCACTTTCCA	CCBFFFFFHHHHHJJIJJJJJJJJJJJJJJJJJJJJJJJJIJIJJJJJJJIIIIJJGIHHHHGFF	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1425:2097	147	U00096.2	4381349	255	65M	=	4381249	-165	AGATCATCGGGTCGCTGAACGCTTTGAGGTTTATTATAAAGGTATTGAGCTGGCGAATGGTTTCC	FFFFFFHHJJJJJIHIJJIJIJIEIJJJIJIJJJJJIJJJJJJJJJIIJJIJHHHHHFFFFFCCC	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1213:2207	163	U00096.2	3632282	255	65M	=	3632429	212	CCATGTTGCAGGAAATAGATGATATGCAGGCCAAAATCGTTATCTATTTGCGAATCATTATCCAG	CCCFFFFFHHHHHJJJJJJJIIGJJJJIJJJJJGIJIGIGIJJJJJJJJJIJJJJJJJJIJGIJJ	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1213:2207	83	U00096.2	3632429	255	65M	=	3632282	-212	TTATCTTGATGGCTTGAAGAGATGTTTCTAATCTGATTGTCAATTGCTTTCATAAATAACCTGTG	HJIGIJIJJIGEJIIIJJIJGJJIIIJJIJJJIIHIGJJJJJJJIIJIJJJIHHHHHEDFFFCCB	XA:i:0	MD:Z:65	NM:i:0
+AMELIA:216:C0LAWACXX:4:1101:1167:2244	163	U00096.2	4135404	255	65M	=	4135541	202	CCAGTTTCTGCAAATCGACCTTCTGGCTGCGCAATGCGCGGATCACCACATCGGAACATACACCG	CCCFFFFFHHHHHJJJIJJJJJIJJJJJJJJJJJJIJJJJJJJHGHGFFFFFDDCDDDDDDDDDD	XA:i:0	MD:Z:65	NM:i:0
+\end{verbatim}
+
+\clearpage
+
+\section{Appendix: Chromosome Information File}\label{example_chrfile}
+
+The first and second columns indicate chromosome ID and its size, respectively.
+
+\begin{verbatim}
+chr1    249250621
+chr2    243199373
+chr3    198022430
+chr4    191154276
+chr5    180915260
+chr6    171115067
+chr7    159138663
+chr8    146364022
+chr9    141213431
+chr10   135534747
+\end{verbatim}
+
+\end{document}
Binary file mosaics/inst/doc/mosaics-example.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/BinData-class.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,88 @@
+\name{BinData-class}
+\Rdversion{1.1}
+\docType{class}
+\alias{BinData-class}
+\alias{chrID,BinData-method}
+\alias{coord,BinData-method}
+\alias{gcContent,BinData-method}
+\alias{input,BinData-method}
+\alias{mappability,BinData-method}
+%\alias{mosaicsFit,BinData-method}
+\alias{plot,BinData,missing-method}
+\alias{print,BinData-method}
+\alias{show,BinData-method}
+\alias{tagCount,BinData-method}
+
+\title{Class "BinData" }
+\description{
+This class represents bin-level ChIP-seq data.
+}
+\section{Objects from the Class}{
+Objects can be created by calls of the form \code{new("BinData", ...)}.
+}
+\section{Slots}{
+  \describe{
+    \item{\code{chrID}:}{Object of class \code{"character"},
+        a vector of chromosome IDs. }
+    \item{\code{coord}:}{Object of class \code{"numeric"},
+        a vector of genomic coordinates. }
+    \item{\code{tagCount}:}{Object of class \code{"numeric"},
+        a vector of tag counts of ChIP sample. }
+    \item{\code{mappability}:}{Object of class \code{"numeric"},
+        a vector of mappability score. }
+    \item{\code{gcContent}:}{Object of class \code{"numeric"},
+        a vector of GC content score. }
+    \item{\code{input}:}{Object of class \code{"numeric"},
+        a vector of tag counts of matched control sample. }
+    \item{\code{dataType}:}{Object of class \code{"character"},
+        indicating how reads were processed. Possible values are
+        "unique" (only uniquely aligned reads were retained)
+        and "multi" (reads aligned to multiple locations were also retained). }
+  }
+}
+\section{Methods}{
+  \describe{
+    \item{mosaicsFit}{\code{signature(object = "BinData")}:
+        fit a MOSAiCS model using bin-level ChIP-seq data. }
+    \item{plot}{\code{signature(x = "BinData", y = "missing", plotType = NULL )}:
+        provide exploratory plots of mean ChIP tag counts.
+        This method plots mean ChIP tag counts versus mappability score,
+        GC content score, and Control tag counts, with 95\% confidence intervals,
+    for \code{plotType="M"}, \code{plotType="GC"}, and \code{plotType="input"}, respectively.
+        \code{plotType="M|input"} and \code{plotType="GC|input"} provide
+        plots of mean ChIP tag counts versus mappability and GC content score, respectively,
+        conditional on Control tag counts.
+        If \code{plotType} is not specified, this method plots a histogram of ChIP tag counts. }
+    \item{print}{\code{signature(x = "BinData")}:
+        return bin-level data in data frame format. }
+    \item{show}{\code{signature(object = "BinData")}: provide brief summary of the object. }
+     }
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{readBins}}, \code{\link{mosaicsFit}}.
+}
+\examples{
+showClass("BinData")
+\dontrun{
+library(mosaicsExample)
+data(exampleBinData)
+
+exampleBinData
+print(exampleBinData)[1:10,]
+plot(exampleBinData)
+plot( exampleBinData, plotType="M" )
+plot( exampleBinData, plotType="GC" )
+plot( exampleBinData, plotType="input" )
+plot( exampleBinData, plotType="M|input" )
+plot( exampleBinData, plotType="GC|input" )
+
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO" )
+}
+}
+\keyword{classes}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/MosaicsFit-class.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,71 @@
+\name{MosaicsFit-class}
+\Rdversion{1.1}
+\docType{class}
+\alias{MosaicsFit-class}
+%\alias{mosaicsPeak,MosaicsFit-method}
+%\alias{estimates,MosaicsFit-method}
+\alias{plot,MosaicsFit,ANY-method}
+\alias{print,MosaicsFit-method}
+\alias{show,MosaicsFit-method}
+
+\title{Class "MosaicsFit" }
+\description{
+This class represents MOSAiCS model fit.
+}
+\section{Objects from the Class}{
+Objects can be created by calls of the form \code{new("MosaicsFit", ...)}.
+}
+\section{Slots}{
+  \describe{
+    \item{\code{mosaicsEst}:}{Object of class \code{"MosaicsFitEst"},
+        representing estimates of MOSAiCS model fit. }
+    \item{\code{mosaicsParam}:}{Object of class \code{"MosaicsFitParam"},
+        representing tuning parameters for fitting MOSAiCS model. }
+    \item{\code{chrID}:}{Object of class \code{"character"},
+        a vector of chromosome IDs. }
+    \item{\code{coord}:}{Object of class \code{"numeric"},
+        a vector of genomic coordinates. }
+    \item{\code{tagCount}:}{Object of class \code{"numeric"},
+        a vector of tag counts of ChIP sample. }
+    \item{\code{bic1S}:}{Object of class \code{"numeric"},
+        Bayesian Information Criterion (BIC) value of one-signal-component model. }
+    \item{\code{bic2S}:}{Object of class \code{"numeric"},
+        Bayesian Information Criterion (BIC) value of two-signal-component model. }
+  }
+}
+\section{Methods}{
+  \describe{
+    \item{estimates}{\code{signature(object = "MosaicsFit")}:
+        extract estimates from MOSAiCS model fit. }
+    \item{mosaicsPeak}{\code{signature(object = "MosaicsFit")}:
+        call peaks using MOSAiCS model fit. }
+    \item{plot}{\code{signature(x = "MosaicsFit", y = "missing")}:
+        draw Goodness of Fit (GOF) plot. }
+    \item{print}{\code{signature(x = "MosaicsFit")}:
+        (not supported yet) }
+    \item{show}{\code{signature(object = "MosaicsFit")}:
+        provide brief summary of the object. }
+     }
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903 (\url{http://pubs.amstat.org/doi/abs/10.1198/jasa.2011.ap09706}).
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{mosaicsFit}}, \code{\link{mosaicsPeak}}, \code{\link{estimates}}.
+}
+\examples{
+showClass("MosaicsFit")
+\dontrun{
+library(mosaicsExample)
+data(exampleFit)
+
+exampleFit
+plot(exampleFit)
+estimates(exampleFit)
+
+examplePeak <- mosaicsPeak( exampleFit, signalModel = "2S", FDR = 0.05 )
+}
+}
+\keyword{classes}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/MosaicsPeak-class.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,59 @@
+\name{MosaicsPeak-class}
+\Rdversion{1.1}
+\docType{class}
+\alias{MosaicsPeak-class}
+\alias{bdBin,MosaicsPeak-method}
+\alias{empFDR,MosaicsPeak-method}
+%\alias{export,MosaicsPeak-method}
+\alias{print,MosaicsPeak-method}
+\alias{show,MosaicsPeak-method}
+
+\title{Class "MosaicsPeak" }
+\description{
+This class represents peak calling results.
+}
+\section{Objects from the Class}{
+Objects can be created by calls of the form \code{new("MosaicsPeak", ...)}.
+}
+\section{Slots}{
+  \describe{
+    \item{\code{peakList}:}{Object of class \code{"data.frame"}, representing peak list. }
+    \item{\code{peakParam}:}{Object of class \code{"MosaicsPeakParam"},
+        representing parameters for peak calling. }
+    \item{\code{bdBin}:}{Object of class \code{"numeric"},
+    		representing a vector of bounded bins. }
+    \item{\code{empFDR}:}{Object of class \code{"numeric"},
+    		representing empirical FDR. }
+  }
+}
+\section{Methods}{
+  \describe{
+    \item{export}{\code{signature(object = "MosaicsPeak")}: export peak list into text files. }
+    \item{print}{\code{signature(x = "MosaicsPeak")}: return peak list in data frame format. }
+    \item{show}{\code{signature(object = "MosaicsPeak")}: provide brief summary of the object. }
+     }
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{mosaicsPeak}}, \code{\link{export}}.
+}
+\examples{
+showClass("MosaicsPeak")
+\dontrun{
+library(mosaicsExample)
+data(exampleFit)
+examplePeak <- mosaicsPeak( exampleFit, signalModel = "2S", FDR = 0.05 )
+
+examplePeak
+print(examplePeak)[1:10, ]
+export( examplePeak, type = "txt", filename = "./TSpeakList.txt" )
+export( examplePeak, type = "bed", filename = "./TSpeakList.bed" )
+export( examplePeak, type = "gff", filename = "./TSpeakList.gff" )
+}
+}
+\keyword{classes}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/constructBins.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,129 @@
+\name{constructBins}
+\alias{constructBins}
+\title{
+Construct bin-level ChIP-seq data from an aligned read file
+}
+\description{
+Preprocess and construct bin-level ChIP-seq data from an aligned read file.
+}
+\usage{
+constructBins( infile=NULL, fileFormat=NULL, outfileLoc="./",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    PET=FALSE, fragLen=200, binSize=200, capping=0, perl = "perl" )
+}
+\arguments{
+  \item{infile}{
+    Name of the aligned read file to be processed.
+}
+  \item{fileFormat}{
+    Format of the aligned read file to be processed.
+    Currently, \code{constructBins} permits the following aligned read file formats
+    for SET data (\code{PET = FALSE}):
+        \code{"eland_result"} (Eland result), \code{"eland_extended"} (Eland extended),
+        \code{"eland_export"} (Eland export), \code{"bowtie"} (default Bowtie),
+        \code{"sam"} (SAM), \code{"bed"} (BED), and \code{"csem"} (CSEM).
+    For PET data (\code{PET = TRUE}), the following aligned read file formats are allowed:
+        \code{"eland_result"} (Eland result) and \code{"sam"} (SAM).
+}
+  \item{outfileLoc}{
+    Directory of processed bin-level files.
+    By default, processed bin-level files are exported to the current directory.
+}
+  \item{byChr}{
+    Construct separate bin-level file for each chromosome?
+    Possible values are \code{TRUE} or \code{FALSE}.
+    If \code{byChr=FALSE}, bin-level data for all chromosomes are exported to one file.
+    If \code{byChr=TRUE}, bin-level data for each chromosome is exported to a separate file.
+    Default is \code{FALSE}.
+}
+  \item{useChrfile}{
+    Is the file for chromosome info provided?
+    Possible values are \code{TRUE} or \code{FALSE}.
+    If \code{useChrfile=FALSE}, it is assumed that the file for chromosome info is not provided.
+    If \code{useChrfile=TRUE}, it is assumed that the file for chromosome info is provided.
+    Default is \code{FALSE}.
+}
+  \item{chrfile}{
+    Name of the file for chromosome info.
+    In this file, the first and second columns are ID and size of each chromosome, respectively.
+}
+  \item{excludeChr}{
+    Vector of chromosomes that will be excluded from the analysis.
+    This argument is ignored if \code{useChrfile=TRUE}.
+}
+  \item{PET}{
+    Is the file paired-end tag (PET) data?
+    If \code{PET=FALSE}, it is assumed that the file is SET data.
+    If \code{PET=TRUE}, it is assumed that the file is PET data.
+    Default is \code{FALSE} (SET data).
+}
+  \item{fragLen}{
+    Average fragment length. Default is 200.
+    This argument is ignored if \code{PET=TRUE}.
+}
+  \item{binSize}{
+    Size of bins. Default is 200.
+}
+  \item{capping}{
+    Maximum number of reads allowed to start at each nucleotide position.
+    To avoid potential PCR amplification artifacts, the maximum number of reads
+    that can start at a nucleotide position is capped at \code{capping}.
+    Capping is not applied if non-positive value is used for \code{capping}.
+    Default is 0 (no capping).
+}
+  \item{perl}{
+    Name of the perl executable to be called. Default is \code{"perl"}.
+}
+}
+\details{
+Bin-level files are constructed from the aligned read file and
+exported to the directory specified in \code{outfileLoc} argument.
+If \code{byChr=FALSE}, bin-level files are named
+as \code{[infileName]_fragL[fragLen]_bin[binSize].txt} for SET data (\code{PET = FALSE})
+and \code{[infileName]_bin[binSize].txt} for PET data (\code{PET = TRUE}).
+If \code{byChr=TRUE}, bin-level files are named
+as \code{[infileName]_fragL[fragLen]_bin[binSize]_[chrID].txt} for SET data (\code{PET = FALSE})
+and \code{[infileName]_bin[binSize]_[chrID].txt} for PET data (\code{PET = TRUE}),
+where \code{chrID} is the ID of the chromosome that the reads align to.
+These chromosome IDs are extracted from the aligned read file.
+
+If the file for chromosome information is provided (\code{useChrfile=TRUE} and \code{chrfile} is not NULL),
+only the chromosomes specified in the file will be considered.
+Chromosomes that are specified in \code{excludeChr}
+will not be included in the processed bin-level files.
+\code{excludeChr} argument is ignored if \code{useChrfile=TRUE}.
+Constructed bin-level files can be loaded into the R environment using the method \code{readBins}.
+
+\code{constructBins} currently supports the following aligned read file formats
+for SET data (\code{PET = FALSE}):
+Eland result (\code{"eland_result"}), Eland extended (\code{"eland_extended"}),
+Eland export (\code{"eland_export"}), default Bowtie (\code{"bowtie"}),
+SAM (\code{"sam"}), BED (\code{"bed"}), and CSEM (\code{"csem"}).
+For PET data (\code{PET = TRUE}), the following aligned read file formats are allowed:
+\code{"eland_result"} (Eland result) and \code{"sam"} (SAM).
+
+If input file format is neither BED nor CSEM BED,
+this method retains only reads mapping uniquely to the reference genome.
+}
+\value{
+Processed bin-level files are exported to the directory specified in \code{outfileLoc}.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{readBins}}, \code{\linkS4class{BinData}}.
+}
+\examples{
+\dontrun{
+constructBins( infile="/scratch/eland/STAT1_eland_results.txt",
+    fileFormat="eland_result", outfileLoc="/scratch/eland/",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr="chrM",
+    PET = FALSE, fragLen=200, binSize=200, capping=0 )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/estimates.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,56 @@
+\name{estimates}
+\alias{estimates}
+\alias{estimates,MosaicsFit-method}
+\title{
+Extract estimates of the fitted MOSAiCS model
+}
+\description{
+Extract estimates from \code{MosaicsFit} class object, which is a fitted MOSAiCS model.
+}
+\usage{
+estimates( object, ... )
+\S4method{estimates}{MosaicsFit}( object )
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+    \item{object}{Object of class \code{MosaicsFit}, which represents
+        a fitted MOSAiCS model obtained using method \code{mosaicsFit}. }
+    \item{...}{ Other parameters to be passed through to generic \code{estimates}.}
+}
+\value{
+Returns a list with components:
+    \item{pi0}{ Mixing proportion of background component. }
+    \item{a}{ Parameter for background component. }
+    \item{betaEst}{ Parameter for background component (coefficient estimates). }
+    \item{muEst}{ Parameter for background component. }
+    \item{b}{ Parameter for one-signal-component model. }
+    \item{c}{ Parameter for one-signal-component model. }
+    \item{p1}{ Parameter for two-signal-component model (mixing proportion of signal components). }
+    \item{b1}{ Parameter for two-signal-component model (the first signal component). }
+    \item{c1}{ Parameter for two-signal-component model (the first signal component). }
+    \item{b2}{ Parameter for two-signal-component model (the second signal component). }
+    \item{c2}{ Parameter for two-signal-component model (the second signal component). }
+    \item{analysisType}{ Analysis type.
+        Possible values are "OS" (one-sample analysis),
+        "TS" (two-sample analysis using mappability and GC content), and
+        "IO" (two-sample analysis without using mappability and GC content). }
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{mosaicsFit}}, \code{\linkS4class{MosaicsFit}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+data(exampleFit)
+
+estimates(exampleFit)
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/export.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,57 @@
+\name{export}
+\alias{export}
+\alias{export,MosaicsPeak-method}
+\title{
+Export peak calling results to text files
+}
+\description{
+Export peak calling results to text files in TXT, BED, or GFF file formats.
+}
+\usage{
+export(object, ...)
+\S4method{export}{MosaicsPeak}( object, type=NA, filename=NA )
+}
+\arguments{
+    \item{object}{Object of class \code{MosaicsPeak},
+        peak calling results obtained using method \code{mosaicsPeak}. }
+    \item{type}{Format of the exported file.
+        Possible values are \code{"txt"}, \code{"bed"}, and \code{"gff"}. See Details. }
+    \item{filename}{Name of the exported file. }
+    \item{...}{ Other parameters to be passed through to generic \code{export}.}
+}
+\details{
+TXT file format (\code{type="txt"}) exports peak calling results in the most informative way.
+Columns include chromosome ID, peak start position, peak end position, peak width,
+average posterior probability, minimum posterior probability,
+average ChIP tag count, maximum ChIP tag count (always),
+average input tag count, average input tag count scaled by sequencing depth,
+average log base 2 ratio of ChIP over input tag counts
+(if matched control sample is also provided),
+average mappability score, and average GC content score
+(when mappability and GC content scores are used in the analysis) in each peak.
+\code{type="bed"} and \code{type="gff"} export peak calling results in standard BED and GFF file formats,
+respectively, where score is the average ChIP tag counts in each peak.
+If no peak is detected, \code{export} method will not generate any file.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{mosaicsPeak}}, \code{\linkS4class{MosaicsPeak}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+data(exampleFit)
+
+examplePeak <- mosaicsPeak( exampleFit, signalModel = "2S", FDR = 0.05 )
+export( examplePeak, type = "txt", filename = "./TSpeakList.txt" )
+export( examplePeak, type = "bed", filename = "./TSpeakList.bed" )
+export( examplePeak, type = "gff", filename = "./TSpeakList.gff" )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/generateWig.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,128 @@
+\name{generateWig}
+\alias{generateWig}
+\title{
+Construct wiggle files from an aligned ChIP-seq read file
+}
+\description{
+Construct wiggle files from an aligned ChIP-seq read file.
+}
+\usage{
+generateWig( infile=NULL, fileFormat=NULL, outfileLoc="./",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    PET=FALSE, fragLen=200, span=200, capping=0, normConst=1, perl = "perl" )
+}
+\arguments{
+  \item{infile}{
+    Name of the aligned read file to be processed.
+}
+  \item{fileFormat}{
+    Format of the aligned read file to be processed.
+    Currently, \code{generateWig} permits the following aligned read file formats
+    for SET data (\code{PET = FALSE}):
+        \code{"eland_result"} (Eland result), \code{"eland_extended"} (Eland extended),
+        \code{"eland_export"} (Eland export), \code{"bowtie"} (default Bowtie),
+        \code{"sam"} (SAM), \code{"bed"} (BED), and \code{"csem"} (CSEM).
+    For PET data (\code{PET = TRUE}), the following aligned read file formats are allowed:
+        \code{"eland_result"} (Eland result) and \code{"sam"} (SAM).
+}
+  \item{outfileLoc}{
+    Directory of processed wiggle files.
+    By default, processed wiggle files are exported to the current directory.
+}
+  \item{byChr}{
+    Construct separate wiggle file for each chromosome?
+    Possible values are \code{TRUE} or \code{FALSE}.
+    If \code{byChr=FALSE}, all chromosomes are exported to one file.
+    If \code{byChr=TRUE}, each chromosome is exported to a separate file.
+    Default is \code{FALSE}.
+}
+  \item{useChrfile}{
+    Is the file for chromosome info provided?
+    Possible values are \code{TRUE} or \code{FALSE}.
+    If \code{useChrfile=FALSE}, it is assumed that the file for chromosome info is not provided.
+    If \code{useChrfile=TRUE}, it is assumed that the file for chromosome info is provided.
+    Default is \code{FALSE}.
+}
+  \item{chrfile}{
+    Name of the file for chromosome info.
+    In this file, the first and second columns are ID and size of each chromosome, respectively.
+}
+  \item{excludeChr}{
+    Vector of chromosomes that will be excluded from the analysis.
+    This argument is ignored if \code{useChrfile=TRUE}.
+}
+  \item{PET}{
+    Is the file paired-end tag (PET) data?
+    If \code{PET=FALSE}, it is assumed that the file is SET data.
+    If \code{PET=TRUE}, it is assumed that the file is PET data.
+    Default is \code{FALSE} (SET data).
+}
+  \item{fragLen}{
+    Average fragment length. Default is 200.
+    This argument is ignored if \code{PET=TRUE}.
+}
+  \item{span}{
+    Span used in wiggle files. Default is 200.
+}
+  \item{capping}{
+    Maximum number of reads allowed to start at each nucleotide position.
+    To avoid potential PCR amplification artifacts, the maximum number of reads
+    that can start at a nucleotide position is capped at \code{capping}.
+    Capping is not applied if non-positive value is used for \code{capping}.
+    Default is 0 (no capping).
+}
+  \item{normConst}{
+    Normalizing constant to scale values in each position.
+}
+  \item{perl}{
+    Name of the perl executable to be called. Default is \code{"perl"}.
+}
+}
+\details{
+Wiggle files are constructed from the aligned read file and
+exported to the directory specified in \code{outfileLoc} argument.
+If \code{byChr=FALSE}, wiggle files are named
+as \code{[infileName]_fragL[fragLen]_span[span].wig} for SET data (\code{PET = FALSE})
+and \code{[infileName]_span[span].wig} for PET data (\code{PET = TRUE}).
+If \code{byChr=TRUE}, wiggle files are named
+as \code{[infileName]_fragL[fragLen]_span[span]_[chrID].wig} for SET data (\code{PET = FALSE})
+and \code{[infileName]_span[span]_[chrID].wig} for PET data (\code{PET = TRUE}),
+where \code{chrID} is the ID of the chromosome that the reads align to.
+These chromosome IDs are extracted from the aligned read file.
+
+If the file for chromosome information is provided (\code{useChrfile=TRUE} and \code{chrfile} is not NULL),
+only the chromosomes specified in the file will be considered.
+Chromosomes that are specified in \code{excludeChr}
+will not be included in the processed wiggle files.
+\code{excludeChr} argument is ignored if \code{useChrfile=TRUE}.
+
+\code{generateWig} currently supports the following aligned read file formats
+for SET data (\code{PET = FALSE}):
+Eland result (\code{"eland_result"}), Eland extended (\code{"eland_extended"}),
+Eland export (\code{"eland_export"}), default Bowtie (\code{"bowtie"}),
+SAM (\code{"sam"}), BED (\code{"bed"}), and CSEM (\code{"csem"}).
+For PET data (\code{PET = TRUE}), the following aligned read file formats are allowed:
+\code{"eland_result"} (Eland result) and \code{"sam"} (SAM).
+
+If input file format is neither BED nor CSEM BED,
+this method retains only reads mapping uniquely to the reference genome.
+}
+\value{
+Processed wig files are exported to the directory specified in \code{outfileLoc}.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\examples{
+\dontrun{
+generateWig( infile="/scratch/eland/STAT1_eland_results.txt",
+    fileFormat="eland_result", outfileLoc="/scratch/eland/",
+    byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr="chrM",
+    PET = FALSE, fragLen=200, span=200, capping=0, normConst=1 )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/mosaics-internal.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,48 @@
+\name{mosaics-internal}
+\title{Internal mosaics objects}
+\alias{.adapGridMosaicsZ0_IO}
+\alias{.adapGridMosaicsZ0_OS}
+\alias{.adapGridMosaicsZ0_TS}
+\alias{.calcModelBIC}
+\alias{.calcPN}
+\alias{.calMDZ1_2S}
+\alias{.computeStat}
+\alias{.emZ1_2S}
+\alias{.eStepZ1_2S}
+\alias{.exportBED}
+\alias{.exportGFF}
+\alias{.exportTXT}
+\alias{.getParamZ0}
+\alias{.getPH_1S}
+\alias{.getPH_2S}
+\alias{.getPi0}
+\alias{.margDist_1S}
+\alias{.margDist_2S}
+\alias{.margDistZ1_1S}
+\alias{.margDistZ1_2S}
+\alias{.mosaicsFit_IO}
+\alias{.mosaicsFit_OS}
+\alias{.mosaicsFit_TS}
+\alias{.mosaicsZ0_IO}
+\alias{.mosaicsZ0_OS}
+\alias{.mosaicsZ0_TS}
+\alias{.mosaicsZ1_1S}
+\alias{.mosaicsZ1_2S}
+\alias{.mStepZ1_2S}
+\alias{.peakCall}
+\alias{.plotGOF}
+\alias{.processBin_MGC}
+\alias{.processBin_MGCX}
+\alias{.processBin_X}
+\alias{.rlmFit_IO}
+\alias{.rlmFit_OS}
+\alias{.rlmFit_TS}
+\alias{.reportSummary}
+\alias{.validDataType}
+\alias{.validLocation}
+\alias{.validType}
+\alias{conv_1S}
+\alias{conv_2S}
+\description{Internal mosaics objects.}
+\details{These are not to be called by the user.}
+\keyword{internal}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/mosaics-package.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,81 @@
+\name{mosaics-package}
+\alias{mosaics-package}
+\alias{mosaics}
+\docType{package}
+\title{
+MOSAiCS (MOdel-based one and two Sample Analysis and Inference for ChIP-Seq)
+}
+\description{
+This package provides functions for fitting MOSAiCS,
+a statistical framework to analyze one-sample or two-sample ChIP-seq data.
+}
+\details{
+\tabular{ll}{
+Package: \tab mosaics\cr
+Type: \tab Package\cr
+Version: \tab 1.5.4\cr
+Date: \tab 2012-09-24\cr
+License: \tab GPL (>= 2)\cr
+LazyLoad: \tab yes\cr
+}
+This package contains three main classes, \code{BinData}, \code{MosaicsFit}, and \code{MosaicsPeak},
+which represent bin-level ChIP-seq data, MOSAiCS model fit, and MOSAiCS peak calling results, respectively.
+This package contains four main methods,
+\code{constructBins}, \code{readBins}, \code{mosaicsFit}, and \code{mosaicsPeak}.
+\code{constructBins} method constructs bin-level files from the aligned read file.
+\code{readBins} method imports bin-level data and constructs \code{BinData} class object.
+\code{mosaicsFit} method fits a MOSAiCS model using \code{BinData} class object and
+ constructs \code{MosaicsFit} class object.
+\code{mosaicsPeak} method calls peaks using \code{MosaicsFit} class object and
+ constructs \code{MosaicsPeak} class object.
+\code{MosaicsPeak} class object can be exported as text files or transformed into data frame,
+which can be used for the downstream analysis.
+This package also provides methods for simple exploratory analysis.
+
+The \code{mosaics} package companion website, \url{http://www.stat.wisc.edu/~keles/Software/mosaics/},
+provides preprocessing scripts, preprocessed files for diverse reference genomes,
+and easy-to-follow instructions.
+We encourage questions or requests regarding \code{mosaics} package to be posted on
+our Google group, \url{http://groups.google.com/group/mosaics_user_group}.
+Please check the vignette for further details on the \code{mosaics} package and these websites.
+}
+\author{
+Dongjun Chung, Pei Fen Kuan, Sunduz Keles
+
+Maintainer: Dongjun Chung <chungdon@stat.wisc.edu>
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\keyword{ package }
+\seealso{
+\code{\link{constructBins}}, \code{\link{readBins}}, \code{\link{mosaicsFit}}, \code{\link{mosaicsPeak}},
+\code{\linkS4class{BinData}}, \code{\linkS4class{MosaicsFit}}, \code{\linkS4class{MosaicsPeak}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+
+exampleBinData <- readBins( type=c("chip","input"),
+    fileName=c( system.file("extdata/chip_chr21.txt", package="mosaicsExample"),
+    system.file("extdata/input_chr21.txt", package="mosaicsExample") ) )
+exampleBinData
+print(exampleBinData)[1:10, ]
+plot(exampleBinData)
+plot( exampleBinData, plotType="input" )
+
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO" )
+exampleFit
+plot(exampleFit)
+estimates(exampleFit)
+
+examplePeak <- mosaicsPeak( exampleFit, signalModel = "2S", FDR = 0.05 )
+examplePeak
+print(examplePeak)[1:10, ]
+export( examplePeak, type = "txt", filename = "./TSpeakList.txt" )
+export( examplePeak, type = "bed", filename = "./TSpeakList.bed" )
+export( examplePeak, type = "gff", filename = "./TSpeakList.gff" )
+}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/mosaicsFit.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,97 @@
+\name{mosaicsFit}
+\alias{mosaicsFit}
+\alias{mosaicsFit,BinData-method}
+\title{
+Fit MOSAiCS model
+}
+\description{
+Fit one-sample or two-sample MOSAiCS models with one signal component and two signal components.
+}
+\usage{
+mosaicsFit( object, ... )
+\S4method{mosaicsFit}{BinData}( object, analysisType="automatic", bgEst="automatic",
+    k=3, meanThres=NA, s=2, d=0.25, truncProb=0.999, parallel=FALSE, nCore=8 )
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+    \item{object}{Object of class \code{BinData},
+        bin-level ChIP-seq data imported using method \code{readBins}. }
+    \item{analysisType}{Analysis type.
+        Possible values are "OS" (one-sample analysis),
+        "TS" (two-sample analysis using mappability and GC content), and
+        "IO" (two-sample analysis without using mappability and GC content).
+        If \code{analysisType="automatic"},
+                this method tries to make the best guess for \code{analysisType},
+                based on the data provided. }
+    \item{bgEst}{Parameter to determine background estimation approach.
+        Possible values are "matchLow" (estimation using bins with low tag counts) and
+        "rMOM" (estimation using robust method of moment (MOM)).
+        If \code{bgEst="automatic"},
+                this method tries to make the best guess for \code{bgEst},
+                based on the data provided.}
+    \item{k}{Parameter for estimating background distribution.
+        It is not recommended for users to change this value. }
+    \item{meanThres}{Parameter for estimating background distribution.
+        Default is 1 for \code{analysisType="TS"} and 0 for \code{analysisType="OS"}.
+        Not relevant when \code{analysisType="IO"}. }
+    \item{s}{Parameter for estimating background distribution.
+        Relevant only when \code{analysisType="TS"}. Default is 2. }
+    \item{d}{Parameter for estimating background distribution.
+        Relevant only when \code{analysisType="TS"} or \code{analysisType="IO"}.
+        Default is 0.25. }
+    \item{truncProb}{Parameter for estimating background distribution.
+    		Relevant only when \code{analysisType="IO"}. }
+    \item{parallel}{Utilize multiple CPUs for parallel computing
+        using \code{"parallel"} package?
+        Possible values are \code{TRUE} (utilize multiple CPUs)
+        or \code{FALSE} (do not utilize multiple CPUs).
+        Default is \code{FALSE} (do not utilize multiple CPUs). }
+    \item{nCore}{Number of CPUs when parallel computing is utilized. }
+    \item{...}{ Other parameters to be passed through to generic \code{mosaicsFit}.}
+}
+\details{
+The imported data type constrains the analysis that can be implemented.
+If only data for ChIP sample and matched control sample
+(i.e., either \code{type=c("chip", "input")} or \code{type=c("chip", "input", "N")}
+was used in method \code{readBins}),
+only two-sample analysis without using mappability and GC content
+(\code{analysisType="IO"}) is allowed.
+If matched control data is available
+with mappability score, GC content score, and sequence ambiguity score,
+(i.e., \code{type=c("chip", "input", "M", "GC", "N")} was used in method \code{readBins}),
+user can do all three analysis types
+(\code{analysisType="OS"}, \code{analysisType="TS"}, or \code{analysisType="IO"}).
+If there is no data for matched control sample
+(i.e., \code{type=c("chip", "M", "GC", "N")} was used in method \code{readBins}),
+only one-sample analysis (\code{analysisType="OS"}) is permitted.
+
+Parallel computing can be utilized for faster computing
+if \code{parallel=TRUE} and \code{parallel} package is loaded.
+\code{nCore} determines number of CPUs used for parallel computing.
+\code{meanThres}, \code{s}, \code{d}, and \code{truncProb} are
+the tuning parameters for estimating background distribution.
+The vignette and Kuan et al. (2011) provide further details about these tuning parameters.
+Please do not try a different value for the \code{k} argument.
+}
+\value{
+Construct \code{MosaicsFit} class object.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{readBins}}, \code{\linkS4class{MosaicsFit}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+data(exampleBinData)
+
+exampleFit <- mosaicsFit( exampleBinData, analysisType="IO", bgEst="automatic" )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/mosaicsPeak.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,70 @@
+\name{mosaicsPeak}
+\alias{mosaicsPeak}
+\alias{mosaicsPeak,MosaicsFit-method}
+\title{
+Call peaks using fitted MOSAiCS model
+}
+\description{
+Call peaks using \code{MosaicsFit} class object, which is a fitted MOSAiCS model.
+}
+\usage{
+mosaicsPeak( object, ... )
+\S4method{mosaicsPeak}{MosaicsFit}( object, signalModel="2S", FDR=0.05,
+    binsize=NA, maxgap=200, minsize=50, thres=10 )
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+    \item{object}{Object of class \code{MosaicsFit},
+        a fitted MOSAiCS model obtained using function \code{mosaicsFit}. }
+    \item{signalModel}{Signal model.
+        Possible values are "1S" (one-signal-component model) and
+        "2S" (two-signal-component model). Default is "2S". }
+    \item{FDR}{False discovery rate. Default is 0.05. }
+    \item{binsize}{Size of each bin. Value should be positive integer.
+        If \code{binsize=NA}, \code{mosaicsPeak} function calculates the value from data.
+        Default is \code{NA}. }
+    \item{maxgap}{Initial nearby peaks are merged if the distance (in bp) between them is less than \code{maxgap}. Default is 200. }
+    \item{minsize}{An initial peak is removed if its width is narrower than \code{minsize}. Default is 50. }
+    \item{thres}{A bin within initial peak is removed if its ChIP tag counts are less than \code{thres}. Default is 10. }
+    \item{...}{ Other parameters to be passed through to generic \code{mosaicsPeak}.}
+}
+\details{
+When peaks are called, proper signal model needs to be specified.
+The optimal choice for the number of signal components depends on the characteristics of ChIP-seq data.
+In order to support users in the choice of optimal signal model,
+Bayesian Information Criterion (BIC) values and Goodness of Fit (GOF) plot are provided
+for the fitted MOSAiCS model.
+BIC values and GOF plot can be obtained by applying \code{show} and \code{plot} methods,
+respectively, to the \code{MosaicsFit} class object, which is a fitted MOSAiCS model.
+\code{maxgap}, \code{minsize}, and \code{thres} are for refining initial peaks called
+using specified \code{signalModel} and \code{FDR}.
+
+If you use a bin size shorter than the average fragment length of the experiment,
+we recommend setting \code{maxgap} to the average fragment length
+and \code{minsize} to the bin size.
+If you set the bin size to the average fragment length or if bin size is larger than the average fragment length,
+set \code{maxgap} to the average fragment length and
+\code{minsize} to a value smaller than the average fragment length. See the vignette for further details.
+}
+\value{
+Construct \code{MosaicsPeak} class object.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{mosaicsFit}}, \code{\linkS4class{MosaicsPeak}}, \code{\linkS4class{MosaicsFit}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+data(exampleFit)
+
+examplePeak <- mosaicsPeak( exampleFit, signalModel = "2S", FDR = 0.05 )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/mosaicsRunAll.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,253 @@
+\name{mosaicsRunAll}
+\alias{mosaicsRunAll}
+\title{
+Analyze ChIP-seq data using the MOSAiCS framework
+}
+\description{
+Construct bin-level ChIP-seq data from aligned read files of ChIP and matched control samples,
+fit a MOSAiCS model, call peaks, export peak calling results,
+and generate reports for diagnostics.
+}
+\usage{
+mosaicsRunAll(
+    chipFile=NULL, chipFileFormat=NULL,
+    controlFile=NULL, controlFileFormat=NULL,
+    binfileDir=NULL,
+    peakFile=NULL, peakFileFormat=NULL,
+    reportSummary=FALSE, summaryFile=NULL,
+    reportExploratory=FALSE, exploratoryFile=NULL,
+    reportGOF=FALSE, gofFile=NULL,
+    PET=FALSE, byChr=FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr=NULL,
+    FDR=0.05, fragLen=200, binSize=200, capping=0, bgEst="automatic", d=0.25,
+    signalModel="BIC", maxgap=200, minsize=50,
+    thres=10, parallel=FALSE, nCore=8 )
+}
+\arguments{
+  \item{chipFile}{
+    Name of the aligned read file of ChIP sample to be processed.
+}
+  \item{chipFileFormat}{
+    Format of the aligned read file of ChIP sample to be processed.
+    Currently, \code{mosaicsRunAll} permits the following aligned read file formats:
+        \code{"eland_result"} (Eland result), \code{"eland_extended"} (Eland extended),
+        \code{"eland_export"} (Eland export), \code{"bowtie"} (default Bowtie),
+        \code{"sam"} (SAM), \code{"bed"} (BED), and \code{"csem"} (CSEM BED).
+        Note that \code{"csem"} does not mean CSEM output file format, but CSEM BED file format.
+}
+  \item{controlFile}{
+    Name of the aligned read file of matched control sample to be processed.
+}
+  \item{controlFileFormat}{
+    Format of the aligned read file of matched control sample to be processed.
+    Currently, \code{mosaicsRunAll} permits the following aligned read file formats:
+        \code{"eland_result"} (Eland result), \code{"eland_extended"} (Eland extended),
+        \code{"eland_export"} (Eland export), \code{"bowtie"} (default Bowtie),
+        \code{"sam"} (SAM), \code{"bed"} (BED), and \code{"csem"} (CSEM BED).
+        Note that \code{"csem"} does not mean CSEM output file format, but CSEM BED file format.
+}
+  \item{binfileDir}{
+    Directory to store processed bin-level files.
+}
+  \item{peakFile}{
+    Name of the peak list generated from the analysis.
+}
+  \item{peakFileFormat}{
+    Format of the peak list generated from the analysis.
+    Possible values are \code{"txt"}, \code{"bed"}, and \code{"gff"}.
+}
+  \item{reportSummary}{
+    Report the summary of model fitting and peak calling?
+    Possible values are \code{TRUE} (YES) and \code{FALSE} (NO).
+    Default is \code{FALSE} (NO).
+}
+  \item{summaryFile}{
+    File name of the summary report of model fitting and peak calling.
+    The summary report is a text file.
+}
+  \item{reportExploratory}{
+    Report the exploratory analysis plots?
+    Possible values are \code{TRUE} (YES) and \code{FALSE} (NO).
+    Default is \code{FALSE} (NO).
+}
+  \item{exploratoryFile}{
+    Name of the file for exploratory analysis plots.
+    The exploratory analysis results are exported as a PDF file.
+}
+  \item{reportGOF}{
+    Report the goodness of fit (GOF) plots?
+    Possible values are \code{TRUE} (YES) and \code{FALSE} (NO).
+    Default is \code{FALSE} (NO).
+}
+  \item{gofFile}{
+    Name of the file for goodness of fit (GOF) plots.
+    The GOF plots are exported as a PDF file.
+}
+  \item{PET}{
+    Is the file paired-end tag (PET) data?
+    If \code{PET=FALSE}, it is assumed that the file is SET data.
+    If \code{PET=TRUE}, it is assumed that the file is PET data.
+    Default is \code{FALSE} (SET data).
+}
+  \item{byChr}{
+    Analyze ChIP-seq data for each chromosome separately or analyze it genome-wide?
+    Possible values are \code{TRUE} (chromosome-wise) and \code{FALSE} (genome-wide).
+    Default is \code{FALSE} (genome-wide analysis).
+}
+  \item{useChrfile}{
+    Is the file for chromosome info provided?
+    Possible values are \code{TRUE} or \code{FALSE}.
+    If \code{useChrfile=FALSE}, it is assumed that the file for chromosome info is not provided.
+    If \code{useChrfile=TRUE}, it is assumed that the file for chromosome info is provided.
+    Default is \code{FALSE}.
+}
+  \item{chrfile}{
+    Name of the file for chromosome info.
+    In this file, the first and second columns are ID and size of each chromosome, respectively.
+}
+  \item{excludeChr}{
+    Vector of chromosomes that will be excluded from the analysis.
+}
+  \item{FDR}{
+    False discovery rate. Default is 0.05.
+}
+  \item{fragLen}{
+    Average fragment length. Default is 200.
+}
+  \item{binSize}{
+    Size of bins. Default is 200.
+}
+  \item{capping}{
+    Maximum number of reads allowed to start at each nucleotide position.
+    To avoid potential PCR amplification artifacts, the maximum number of reads
+    that can start at a nucleotide position is capped at \code{capping}.
+    Capping is not applied if non-positive \code{capping} is used.
+    Default is 0 (no capping).
+}
+  \item{bgEst}{Parameter to determine background estimation approach.
+    Possible values are "matchLow" (estimation using bins with low tag counts) and
+    "rMOM" (estimation using robust method of moment (MOM)).
+    If \code{bgEst="automatic"},
+            this method tries to make the best guess for \code{bgEst},
+            based on the data provided.}
+  \item{d}{Parameter for estimating background distribution.
+    Default is 0.25. }
+  \item{signalModel}{Signal model.
+    Possible values are "BIC" (automatic model selection using BIC),
+    "1S" (one-signal-component model), and
+    "2S" (two-signal-component model). Default is "BIC". }
+  \item{maxgap}{Initial nearby peaks are merged if the distance (in bp)
+    between them is less than \code{maxgap}. Default is 200. }
+  \item{minsize}{An initial peak is removed if its width is narrower than \code{minsize}.
+    Default is 50. }
+  \item{thres}{A bin within initial peak is removed if its ChIP tag counts are less than \code{thres}.
+    Default is 10. }
+  \item{parallel}{Utilize multiple CPUs for parallel computing
+    using \code{"parallel"} package?
+    Possible values are \code{TRUE} (use multiple CPUs)
+    or \code{FALSE} (do not use multiple CPUs).
+    Default is \code{FALSE} (do not use multiple CPUs).}
+  \item{nCore}{Maximum number of CPUs used for the analysis.
+    Default is 8. }
+}
+\details{
+This method implements the work flow for the two-sample analysis of ChIP-seq data
+using the MOSAiCS framework (without using mappability and GC content scores).
+It imports aligned read files of ChIP and matched control samples,
+processes them into bin-level files, fits MOSAiCS model, calls peaks,
+exports the peak lists to text files, and generates reports for diagnostics.
+This method is a wrapper function of \code{constructBins}, \code{readBins},
+\code{mosaicsFit}, \code{mosaicsPeak}, \code{export} functions, and methods of
+\code{BinData}, \code{MosaicsFit}, and \code{MosaicsPeak} classes.
+
+See the vignette of the package for the illustration of the work flow
+and the description of employed methods and their options.
+Exploratory analysis plots and goodness of fit (GOF) plots are generated
+using the methods \code{plot} of the classes \code{BinData} and \code{MosaicsFit}, respectively.
+See the help of \code{constructBins} for details of the options \code{PET}, \code{chipFileFormat},
+\code{controlFileFormat}, \code{byChr}, \code{useChrfile}, \code{chrfile}, \code{excludeChr},
+\code{fragLen}, \code{binSize}, and \code{capping}.
+See the help of \code{mosaicsFit}
+for details of the options \code{bgEst} and \code{d}.
+See the help of \code{mosaicsPeak} for details of the options \code{FDR},
+\code{signalModel}, \code{maxgap}, \code{minsize}, and \code{thres}.
+See the help of \code{export} for details of the option \code{peakFileFormat}.
+
+When the data contains multiple chromosomes,
+parallel computing can be utilized for faster preprocessing and model fitting
+if \code{parallel=TRUE} and \code{parallel} package is loaded.
+\code{nCore} determines number of CPUs used for parallel computing.
+}
+\value{
+Processed bin-level files are exported to the directory specified in \code{binfileDir} argument.
+If \code{byChr=FALSE} (genome-wide analysis),
+one bin-level file is generated for each of ChIP and matched control samples,
+where file names are \code{[chipFile]_fragL[fragLen]_bin[binSize].txt}
+and \code{[controlFile]_fragL[fragLen]_bin[binSize].txt}, respectively, for SET data (\code{PET = FALSE}).
+For PET data (\code{PET = TRUE}), file names for each of ChIP and matched control samples
+are \code{[chipFile]_bin[binSize].txt} and \code{[controlFile]_bin[binSize].txt}, respectively.
+If \code{byChr=TRUE} (chromosome-wise analysis),
+bin-level files are generated for each chromosome of each of ChIP and matched control samples,
+where file names are \code{[chipFile]_fragL[fragLen]_bin[binSize]_[chrID].txt}
+and \code{[controlFile]_fragL[fragLen]_bin[binSize]_[chrID].txt}, respectively, for SET data (\code{PET = FALSE})
+(\code{[chrID]} is chromosome IDs that reads align to).
+For PET data (\code{PET = TRUE}), file names for each of ChIP and matched control samples
+are \code{[chipFile]_bin[binSize]_[chrID].txt}
+and \code{[controlFile]_bin[binSize]_[chrID].txt}, respectively.
+
+The peak list generated from the analysis is exported
+to the file with the name specified in \code{peakFile}.
+If \code{reportSummary=TRUE}, the summary of model fitting and peak calling is exported
+to the file with the name specified in \code{summaryFile} (text file).
+If \code{reportExploratory=TRUE}, the exploratory analysis plots are exported
+to the file with the name specified in \code{exploratoryFile} (PDF file).
+If \code{reportGOF=TRUE}, the goodness of fit (GOF) plots are exported
+to the file with the name specified in \code{gofFile} (PDF file).
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{constructBins}}, \code{\link{readBins}},
+\code{\link{mosaicsFit}}, \code{\link{mosaicsPeak}}, \code{\link{export}},
+\code{\linkS4class{BinData}}, \code{\linkS4class{MosaicsFit}}, \code{\linkS4class{MosaicsPeak}}.
+}
+\examples{
+\dontrun{
+# minimal input (without any reports for diagnostics)
+
+mosaicsRunAll(
+    chipFile = "/scratch/eland/STAT1_eland_results.txt",
+    chipFileFormat = "eland_result",
+    controlFile = "/scratch/eland/input_eland_results.txt",
+    controlFileFormat = "eland_result",
+    binfileDir = "/scratch/bin/",
+    peakFile = "/scratch/peak/STAT1_peak_list.bed",
+    peakFileFormat = "bed" )
+
+# generate all reports for diagnostics
+
+library(parallel)
+mosaicsRunAll(
+    chipFile = "/scratch/eland/STAT1_eland_results.txt",
+    chipFileFormat = "eland_result",
+    controlFile = "/scratch/eland/input_eland_results.txt",
+    controlFileFormat = "eland_result",
+    binfileDir = "/scratch/bin/",
+    peakFile = "/scratch/peak/STAT1_peak_list.bed",
+    peakFileFormat = "bed",
+    reportSummary = TRUE,
+    summaryFile = "/scratch/reports/mosaics_summary.txt",
+    reportExploratory = TRUE,
+    exploratoryFile = "/scratch/reports/mosaics_exploratory.pdf",
+    reportGOF = TRUE,
+    gofFile = "/scratch/reports/mosaics_GOF.pdf",
+    PET = FALSE, byChr = FALSE, useChrfile=FALSE, chrfile=NULL, excludeChr = "chrM",
+    FDR = 0.05,  fragLen = 200, capping = 0, bgEst="automatic", thres=10,
+    parallel = TRUE, nCore = 8 )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/man/readBins.Rd	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,90 @@
+\name{readBins}
+\alias{readBins}
+\title{
+Import bin-level ChIP-seq data
+}
+\description{
+Import and preprocess
+all or a subset of bin-level ChIP-seq data, including ChIP data, matched control data,
+mappability score, GC content score, and sequence ambiguity score.
+}
+\usage{
+readBins( type = c("chip", "input"), fileName = NULL,
+    dataType = "unique", rounding = 100, parallel=FALSE, nCore=8 )
+}
+\arguments{
+  \item{type}{
+    Character vector indicating data types to be imported.
+    This vector can contain \code{"chip"} (ChIP data), \code{"input"} (matched control data),
+    \code{"M"} (mappability score), \code{"GC"} (GC content score), and \code{"N"} (sequence ambiguity score).
+    Currently, \code{readBins} permits only the following combinations:
+        \code{c("chip", "input")}, \code{c("chip", "input", "N")},
+        \code{c("chip", "input", "M", "GC", "N")}, and \code{c("chip", "M", "GC", "N")}.
+    Default is \code{c("chip", "input")}.
+}
+  \item{fileName}{
+    Character vector of file names, each of which matches each element of \code{type}.
+    \code{type} and \code{fileName} should have the same length and
+    corresponding elements in two vectors should appear in the same order.
+}
+  \item{dataType}{
+    How were reads processed? Possible values are
+    either \code{"unique"} (only uniquely aligned reads were retained)
+    or \code{"multi"} (reads aligned to multiple locations were also retained).
+}
+  \item{rounding}{
+    How are mappability score and GC content score rounded?
+    Default is 100 and this indicates rounding of mappability score and GC content score
+    to the nearest hundredth.
+}
+  \item{parallel}{Utilize multiple CPUs for parallel computing using \code{"parallel"} package?
+    Possible values are \code{TRUE} (use multiple CPUs)
+    or \code{FALSE} (do not use multiple CPUs).
+    Default is \code{FALSE} (do not use multiple CPUs).
+}
+  \item{nCore}{Number of CPUs when parallel computing is utilized.
+}
+}
+\details{
+Bin-level ChIP and matched control data can be generated
+from the aligned read files for your samples using the method \code{constructBins}.
+In \code{mosaics} package companion website, \url{http://www.stat.wisc.edu/~keles/Software/mosaics/},
+we provide preprocessed mappability score, GC content score,
+and sequence ambiguity score files for diverse reference genomes.
+Please check the website and the vignette for further details.
+
+The imported data type constrains the analysis that can be implemented.
+If \code{type=c("chip", "input")} or \code{c("chip", "input", "N")},
+only two-sample analysis without using mappability and GC content is allowed.
+For \code{type=c("chip", "input", "M", "GC", "N")},
+user can do the one- or two-sample analysis.
+If \code{type=c("chip", "M", "GC", "N")}, only one-sample analysis is permitted.
+See help page of \code{mosaicsFit}.
+
+When the data contains multiple chromosomes,
+parallel computing can be utilized for faster preprocessing
+if \code{parallel=TRUE} and \code{parallel} package is loaded.
+\code{nCore} determines number of CPUs used for parallel computing.
+}
+\value{
+Construct \code{BinData} class object.
+}
+\references{
+Kuan, PF, D Chung, G Pan, JA Thomson, R Stewart, and S Keles (2011),
+"A Statistical Framework for the Analysis of ChIP-Seq Data",
+\emph{Journal of the American Statistical Association}, Vol. 106, pp. 891-903.
+}
+\author{ Dongjun Chung, Pei Fen Kuan, Sunduz Keles }
+\seealso{
+\code{\link{constructBins}}, \code{\link{mosaicsFit}}, \code{\linkS4class{BinData}}.
+}
+\examples{
+\dontrun{
+library(mosaicsExample)
+exampleBinData <- readBins( type=c("chip","input"),
+    fileName=c( system.file("extdata/chip_chr21.txt", package="mosaicsExample"),
+    system.file("extdata/input_chr21.txt", package="mosaicsExample") ) )
+}
+}
+\keyword{models}
+\keyword{methods}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/src/Makevars	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,25 @@
+## Use the R_HOME indirection to support installations of multiple R versions
+PKG_LIBS = $(shell $(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()" )
+
+## As an alternative, one can also add this code in a file 'configure'
+##
+##    PKG_LIBS=`${R_HOME}/bin/Rscript -e "Rcpp:::LdFlags()"`
+##
+##    sed -e "s|@PKG_LIBS@|${PKG_LIBS}|" \
+##        src/Makevars.in > src/Makevars
+##
+## which together with the following file 'src/Makevars.in'
+##
+##    PKG_LIBS = @PKG_LIBS@
+##
+## can be used to create src/Makevars dynamically. This scheme is more
+## powerful and can be expanded to also check for and link with other
+## libraries.  It should be complemented by a file 'cleanup'
+##
+##    rm src/Makevars
+##
+## which removes the autogenerated file src/Makevars.
+##
+## Of course, autoconf can also be used to write configure files. This is
+## done by a number of packages, but recommended only for more advanced users
+## comfortable with autoconf and its related tools.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/src/Makevars.win	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,4 @@
+
+## This assumes that we can call Rscript to ask Rcpp about its locations
+## Use the R_HOME indirection to support installations of multiple R versions
+PKG_LIBS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/src/convolution_1S.cpp	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,29 @@
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+RCPP_FUNCTION_5(NumericVector,convolution_1S_cpp,IntegerVector y,NumericVector mu,NumericVector mu_U,NumericMatrix pN,NumericVector pS) {
+
+	// For each element i, evaluate the discrete convolution of pS with the
+	// matching row(s) of pN at the count y[i]. y and mu have the same length;
+	// row j of pN corresponds to mu_U[j] and is used only if mu_U[j]==mu[i].
+
+	const int nObs = y.size();
+	const int nRow = mu_U.size();
+	NumericVector out( nObs );
+
+	for ( int obs=0; obs<nObs; ++obs ) {
+		const int count = y[obs];
+		out[obs] = 0.0;
+		for ( int row=0; row<nRow; ++row ) {
+			if ( mu_U[row]!=mu[obs] ) continue;
+			// sum over every split of count: n from pN, count-n from pS
+			for ( int n=0; n<=count; ++n ) {
+				out[obs] += pS[count-n] * pN(row,n);
+			}
+		}
+	}
+
+	return out;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mosaics/src/convolution_2S.cpp	Thu Jan 10 15:57:50 2013 -0500
@@ -0,0 +1,43 @@
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+RCPP_FUNCTION_6(NumericVector,convolution_2S_cpp,IntegerVector y,NumericVector mu,NumericVector mu_U,NumericMatrix pN,NumericVector pS1,NumericVector pS2) {
+
+	// For every element i, evaluate two discrete convolutions at the
+	// count y[i]: the matching row(s) of pN with pS1, and the same
+	// row(s) of pN with pS2.
+	//
+	// - y and mu have the same length
+	// - row j of pN corresponds to mu_U[j]; it is used when mu_U[j]==mu[i]
+	//
+	// The result is one vector of length 2*length(y) holding the two
+	// convolutions interleaved:
+	//   out[2*i]   = convolution with pS1 for element i
+	//   out[2*i+1] = convolution with pS2 for element i
+
+	const int nObs = y.size();
+	const int nRow = mu_U.size();
+	NumericVector out( 2*nObs );
+
+	for ( int obs=0; obs<nObs; ++obs ) {
+
+		const int count = y[obs];
+		const int slot1 = 2*obs;
+		const int slot2 = slot1 + 1;
+		out[slot1] = 0.0;
+		out[slot2] = 0.0;
+
+		for ( int row=0; row<nRow; ++row ) {
+			if ( mu_U[row]!=mu[obs] ) continue;
+			for ( int n=0; n<=count; ++n ) {
+				const double pNval = pN(row,n);
+				out[slot1] += pS1[count-n] * pNval;
+				out[slot2] += pS2[count-n] * pNval;
+			}
+		}
+	}
+
+	return out;
+}
--- a/mosaics_wrapper.pl	Thu Jan 10 15:57:23 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-#!/usr/bin/env perl
-# Wrapper for MOSAiCS
-# Written by Dongjun Chung, Jan. 15, 2013
-#
-# Builds an R script that calls mosaicsRunAll() with the 24 positional
-# command-line arguments and pipes it into a non-interactive R session.
-# Exits non-zero when the arguments are invalid or R does not finish cleanly.
-
-use warnings;
-use strict;
-use File::Temp qw/tempfile/;
-use File::Temp qw/tempdir/;
-use File::Basename;
-
-# parse command arguments
-# inactive arguments: useChrfile, chrfile, excludeChr
-# meaningless argument: analysisType (for later use)
-
-die "Usage: perl mosaics_wrapper.pl [chip_path] [chip_file_format] [control_path] [control_file_format] [peak_path] [peak_file_format] [analysis_type] [report_summary_path] [report_gof_path] [report_exploratory_path] [PET?] [chromosome-wise?] [fdr_level] [frag_len] [bin_size] [capping] [signal_model] [bg_est_method] [d] [maxgap] [minsize] [thres] [parallel] [n_core]" unless @ARGV == 24;
-
-my ( $chip_path, $chip_file_format, $control_path, $control_file_format, $peak_path, $peak_file_format, $analysis_type, $report_summary_path, $report_gof_path, $report_exploratory_path, $pet, $by_chr, $fdr_level, $frag_len, $bin_size, $capping, $signal_model, $bg_est_method, $d, $maxgap, $minsize, $thres, $parallel, $n_core ) = @ARGV;
-
-# parse options: analysis type (the diagnostic goes to STDERR, not STDOUT)
-
-if ( $analysis_type ne "IO" ) {
-	print STDERR "Only 'IO' is supported for analysis type!\n";
-	exit 1;
-}
-
-# parse options: each optional report is enabled unless its path is "None"
-
-my $report_summary     = ( $report_summary_path     ne "None" ) ? "TRUE" : "FALSE";
-my $report_gof         = ( $report_gof_path         ne "None" ) ? "TRUE" : "FALSE";
-my $report_exploratory = ( $report_exploratory_path ne "None" ) ? "TRUE" : "FALSE";
-
-# directory for the bin-level files; removed automatically when the wrapper exits
-
-my $tempdir_bin = tempdir( CLEANUP => 1 );
-
-# escape backslashes and double quotes so that each value stays inside the
-# double-quoted R string literal it is interpolated into below
-# NOTE: the logical and numeric arguments are inserted verbatim as R expressions
-
-for my $r_string ( $chip_path, $chip_file_format, $control_path, $control_file_format, $tempdir_bin, $peak_path, $peak_file_format, $report_summary_path, $report_gof_path, $report_exploratory_path, $signal_model, $bg_est_method ) {
-	$r_string =~ s/(["\\])/\\$1/g;
-}
-
-# write an R script to run
-
-my $cmd = qq|
-	suppressPackageStartupMessages(library(mosaics))
-	try( suppressPackageStartupMessages(library(parallel)), silent=TRUE )
-
-	mosaicsRunAll(
-		chipFile="$chip_path",
-		chipFileFormat="$chip_file_format",
-		controlFile="$control_path",
-		controlFileFormat="$control_file_format",
-		binfileDir="$tempdir_bin",
-		peakFile="$peak_path",
-		peakFileFormat="$peak_file_format",
-		reportSummary=$report_summary,
-		summaryFile="$report_summary_path",
-		reportExploratory=$report_exploratory,
-		exploratoryFile="$report_exploratory_path",
-		reportGOF=$report_gof,
-		gofFile="$report_gof_path",
-		PET=$pet,
-		byChr=$by_chr,
-		useChrfile=FALSE,
-		chrfile=NULL,
-		excludeChr=NULL,
-		FDR=$fdr_level,
-		fragLen=$frag_len,
-		binSize=$bin_size,
-		capping=$capping,
-		bgEst="$bg_est_method",
-		d=$d,
-		signalModel="$signal_model",
-		maxgap=$maxgap,
-		minsize=$minsize,
-		thres=$thres,
-		parallel=$parallel,
-		nCore=$n_core )
-
-	q()
-	|;
-
-# run R: the script is fed through stdin and all R output is discarded
-# ("> /dev/null 2>&1" is the POSIX sh form of the csh/bash-only ">& /dev/null")
-# close() waits for R, so a non-zero R exit status makes it return false
-
-open( my $r_fh, '|-', 'R --slave --vanilla > /dev/null 2>&1' ) or die "Couldn't call R!\n";
-print {$r_fh} $cmd, "\n" or die "Couldn't write to R!\n";
-close $r_fh or die "Couldn't finish R!\n";
-
-exit;
-
-