Mercurial > repos > matces > carpet_toolsuite

<tool id="normalization" name="PreProcess for Tiling" version="1.0.0">
  <!-- Galaxy tool wrapper: renders the R script in the "script_file" configfile and produces
       a PDF of diagnostic plots plus a tab-separated table (see the outputs section). -->
  <description>normalizing data</description>
  <!-- Runs r_wrapper2.sh under bash; its only argument is the path of the rendered script_file.
       NOTE(review): r_wrapper2.sh is not part of this file; presumably it feeds the script to R. -->
  <command interpreter="bash">r_wrapper2.sh $script_file</command>

  <inputs>
    <!-- Options applied to all chips together; both values are compared as strings inside the R script. -->
    <param name="type" type="select" label="Normalization">
      <option value="bwm" selected="true">Bi-weight function</option>
      <option value="quantile">Quantile</option>
      <option value="none">None</option>
    </param>
    <param name="sum" type="select" label="Summarization">
      <option value="mean" selected="true">Mean</option>
      <option value="median">Median</option>
      <option value="none">None</option>
    </param>

    <!-- One "Chip" entry per input dataset. -->
    <repeat name="series" title="Chip">
      <param name="input" type="data" format="tabular" label="Dataset"/>
      <!-- The T / F value is pasted verbatim into read.table(header=...) by the script. -->
      <param name="header" type="select" label="Headers">
        <option value="T" selected="true">TRUE</option>
        <option value="F">FALSE</option>
      </param>
      <param name="chrom_col" type="data_column" data_ref="input" label="Column for chr value (chr1,etc)"/>
      <param name="start_col" type="data_column" data_ref="input" label="Column for start position"/>

      <!-- end_col is either a column index (si_ce) or a probe length that the script adds to the
           start position (no_ce). The length is typed as an integer because it is inserted
           directly into an R arithmetic expression. -->
      <conditional name="fine_col">
        <param name="si_o_no" type="select" label="End column">
          <option value="si_ce" selected="true">End column present</option>
          <option value="no_ce">End column NOT present</option>
        </param>
        <when value="si_ce">
          <param name="end_col" type="data_column" data_ref="input" label="Column for end position"/>
        </when>
        <when value="no_ce">
          <param name="end_col" type="integer" value="50" size="4" label="average length of the probes"/>
        </when>
      </conditional>

      <!-- The script template references value_col, value_col_cy3 and value_col_cy5 for every
           data type, so each branch must define all three names. The ones a branch does not use
           are hidden placeholders: their value only has to keep the generated R code parseable,
           so it must not be editable by the user. -->
      <conditional name="data">
        <param name="data_type" type="select" label="Data type">
          <option value="log" selected="true">log2(ratio)</option>
          <option value="no_log">one color raw data</option>
          <option value="raw">Cy3-Cy5 raw data</option>
        </param>
        <when value="log">
          <param name="value_col" type="data_column" data_ref="input" label="Column for log2(ratio)"/>
          <param name="value_col_cy3" type="hidden" value="NOT-NEEDED"/>
          <param name="value_col_cy5" type="hidden" value="NOT-NEEDED"/>
        </when>
        <when value="no_log">
          <param name="value_col" type="data_column" data_ref="input" label="Column for raw data"/>
          <param name="value_col_cy3" type="hidden" value="NOT-NEEDED"/>
          <param name="value_col_cy5" type="hidden" value="NOT-NEEDED"/>
        </when>
        <when value="raw">
          <param name="value_col" type="hidden" value="NOT-NEEDED"/>
          <param name="value_col_cy3" type="data_column" data_ref="input" label="Column for Cy3"/>
          <param name="value_col_cy5" type="data_column" data_ref="input" label="Column for Cy5"/>
        </when>
      </conditional>

      <!-- The option values are passed to R as the col= argument of lines() and legend(). -->
      <param name="col" type="select" label="Line Color">
        <option value="1">Black</option>
        <option value="2">Red</option>
        <option value="3">Green</option>
        <option value="4">Blue</option>
        <option value="5">Cyan</option>
        <option value="6">Magenta</option>
        <option value="7">Yellow</option>
        <option value="8">Gray</option>
      </param>
    </repeat>
  </inputs>

  <configfiles>
    <configfile name="script_file">
      ## Send R error messages to stderr and exit with status 1 so that failures are not silent
      options( show.error.messages=FALSE,
               error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, FALSE ) } )
      ## A large scipen makes R prefer fixed over scientific notation when numbers become text
      options(scipen=999)
      library(Ringo)
      pdf( "${out_file1}" )
      ## Both ranges start empty (NULL); the plotting section extends them column by column
      xrange = c( NULL, NULL )
      xrange_norm = c( NULL, NULL )
      ## Read every chip into its own data frame (s0, s1, ...) and collect one signal column per chip in totali
      #for $i, $s in enumerate( $series )
        s${i} = read.table( "${s.input.file_name}",sep="\t",header=$s.header)

        ## Signal of this chip. All three alternatives are rendered for every chip and R selects
        ## one at run time, so the column placeholders of the unused alternatives only need to parse.
        if ("${s.data.data_type}" == "raw") {
            valori=log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}]))
        }
        if ("${s.data.data_type}" == "log") {
            valori=s${i}[,${s.data.value_col}]
        }
        if ("${s.data.data_type}" == "no_log") {
            valori=log2(as.numeric(s${i}[,${s.data.value_col}]))
        }

        #if $i == 0
          ## Probe coordinates and the constant columns of the output table come from the first chip only
          firma=matrix(c("GALAXY","CARPET"),length(s${i}[,${s.chrom_col}]),2,byrow=TRUE)
          fine=matrix(c(".",".","Cesaroni_et_al."),length(s${i}[,${s.chrom_col}]),3,byrow=TRUE)

          if ("${s.fine_col.si_o_no}"== "no_ce"){
              ## No end column: end = start + the probe length entered by the user
              coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],as.numeric(s${i}[,${s.start_col}])+${s.fine_col.end_col})
          }
          if ("${s.fine_col.si_o_no}"== "si_ce"){
              coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],s${i}[,${s.fine_col.end_col}])
          }
          totali=valori
        #else
          ## cbind would recycle or truncate a column of different length with at most a warning,
          ## pairing values with the wrong probes; refuse such input instead.
          if (nrow(s${i}) != nrow(s0)) {
              stop("Chip ", ${i}+1, " has ", nrow(s${i}), " rows but chip 1 has ", nrow(s0), call.=FALSE)
          }
          totali=cbind(totali,valori)
        #end if
      #end for


      ## After the loop the template variable i still holds the index of the last chip
      print (paste("number of chips =",$i+1,sep=" "),quote=FALSE)
      ## Bi-weight mean of x: a weighted mean in which each value gets the weight (1 - u^2)^2,
      ## where u is its distance from the median divided by (c * median absolute deviation + epsilon),
      ## and values with abs(u) above 1 get weight 0.
      ##   x        numeric vector
      ##   c        multiplier of the median absolute deviation that sets the cut-off
      ##   epsilon  small constant added to the denominator so it is not zero when the deviation is 0
      ## Returns a single number.
      tukey.biweight = function(x, c = 5, epsilon = 1e-04) {
          centre = median(x)
          spread = median(abs(x - centre))
          scaled = (x - centre)/(c * spread + epsilon)
          wts = numeric(length(x))
          inside = abs(scaled) &lt;= 1
          wts[inside] = ((1 - scaled^2)^2)[inside]
          sum(wts * x)/sum(wts)
      }
      ## As a data frame, length(totali) is the number of chips (columns), also for a single chip
      totali=as.data.frame(totali)
      if ("${type}" == "bwm"){
        ## Centre every chip by subtracting its own bi-weight mean from all of its values
        totali.tbw = apply(totali, 2, tukey.biweight)
        totali_norm = totali - matrix(totali.tbw, nrow = nrow(totali), ncol = ncol(totali), byrow = TRUE)
        for (i in seq_along(totali.tbw)){
            print(paste("bi-weight_mean rep",i,"=",format(totali.tbw[i],digits=3),sep=" "),quote=FALSE)
        }
      }
      if ("${type}" == "quantile"){
        if (length(totali) == 1) {
            ## Raise an error instead of calling q(): q() ends the run with status 0 although
            ## no table and no plots have been produced, which looks like a successful job.
            stop("Quantile normalization is not feasible with one sample", call.=FALSE)
        }
        totali_norm=normalizeBetweenArrays(as.matrix(totali), method="quantile")
      }
      if ("${type}" == "none"){
        totali_norm=totali
      }

      ## x-axis limits: fold the range of every column into the running range of each panel
      for (k in 1:length(as.data.frame(totali_norm))) {
          xrange_norm=range(totali_norm[,k],xrange_norm)
      }

      for (k in 1:length(totali)) {
          xrange=range(totali[,k],xrange)
      }

      ## First plot: density curve of every chip before normalization
      plot( NULL, type="n", xlim=xrange, ylim=c(0,1.2), main="Raw signal distribution", xlab="log2(ratio)",ylab="Density")
      ## colori collects the colour chosen for each chip, in chip order, for the two legends
      colori=character(0)
      #for $i, $s in enumerate( $series )
        lines(density(totali[,${i}+1]), col="${s.col}" )
        colori=c(colori,"${s.col}")
      #end for
      legend(xrange[1], 1.2, pch="-", col=colori, legend=paste("rep",1:(${i}+1),sep="_"))


      ## Second plot: the same curves after normalization
      plot( NULL, type="n", xlim=xrange_norm, ylim=c(0,1.2), main="Normalized signal distribution", xlab="log2(ratio)",ylab="Density")
      #for $i, $s in enumerate( $series )
        lines(density(totali_norm[,${i}+1]), col="${s.col}" )
      #end for
      legend(xrange_norm[1], 1.2, pch="-", col=colori, legend=paste("rep",1:(${i}+1),"norm",sep="_"))


      ## Third plot: correlation between chips, drawn only when there is more than one chip
      if (${i} > 0){
          corPlot(as.matrix(totali_norm),grouping=paste("rep",1:(${i}+1),"norm",sep="_"))
      }
      ## Close the PDF; invisible() keeps the value returned by dev.off() from being printed
      invisible(dev.off())
      ## Collapse the normalized chips into one value per row (or keep all columns for "none")
      totali_norm=as.data.frame(totali_norm)
      if ("${sum}" == "mean"){
        total_sum=apply(totali_norm,1,mean)
      } else if ("${sum}" == "median"){
        total_sum=apply(totali_norm,1,median)
      } else if ("${sum}" == "none"){
        total_sum=totali_norm
      }
      total_sum=round(total_sum,digits=3)
      ## Output columns: coordinates, signal value(s), then the three constant trailing columns
      total_gff=cbind(coord_gff,total_sum,fine)
      ## Rewrite the first "CHR" found in each chromosome name as "chr"
      total_gff[,1]=as.vector(sub("CHR","chr",total_gff[,1]))
      write.table(total_gff,"${out_file2}",sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE)

    </configfile>
  </configfiles>

  <outputs>
    <!-- PDF opened by pdf() in the script: density plots and, for several chips, the correlation plot -->
    <data name="out_file1" format="pdf"/>
    <!-- Tab-separated table written by write.table at the end of the script -->
    <data name="out_file2" format="tabular"/>
  </outputs>

<help>
.. class:: infomark

**What it does**

PPT normalizes single ChIP-chip or multi ChIP-chip experiments.
PPT also computes the correlation between replicates, produces several plots to help assess the quality of the experiment, and creates a GFF file suitable for PeakPicker analysis.

PLEASE, for more detailed information refer to the CARPET user Manual:
click to download_ it.

.. _download: /static/example_file/CARPET_userManual.zip

--------

**Parameters:**

- **Normalization:**
	- **Bi-weight function:** bi-weight function is used to scale all the chips (Standard Nimblegen normalization).
	- **Quantile:** quantile normalization is performed between all the chips.
	- **None:** no normalization is performed.

- **Summarization:**
	- **Mean:** the final value of each probe is the mean between all the chips.
	- **Median:** the final value of each probe is the median between all the chips.
	- **None:** all the values of each probe are given back.
- **Chips:**
	- **Dataset:** input data file.
	- **Headers:** if headers are present or not in the dataset file.
	- **Column for chr value:** the column with the probe Chromosome numbers.
	- **Column for start position:** the column with the probe start positions.
	- **End column:** if the end position of the probes is present or not.
	- **Column for end position:** the column with the probe end positions.
	- **average length of the probes:** the average length of the probes (only for custom chip).
	- **Data type:** choose between log2(ratio), raw values (NOT log transformed) or Cy3-Cy5 raw values, according to the data format.
	- **Column for log2(ratio):** the column with probe log2(ratio) values.
	- **Column for raw data:** the column with probe raw values (NOT log transformed).
	- **Column for Cy3:** the column with probe Cy3 raw value.
	- **Column for Cy5:** the column with probe Cy5 raw value.
	- **Line Color:** the line colors for the graphs created by the script.


-----

.. class:: warningmark

This tool requires at least the following fields in each file or dataset:
	- Chromosome number in this format : chr1 , chr2, etc etc.
	- Start position
	- one column with log2(ratio) or two columns with Cy3 and Cy5 raw values

--------


**INPUT FILE**

This tool accepts any kind of file, with at least the fields described above.

Click here (pair_file_) to download a Cy3-Cy5 pair file example.

.. _pair_file: /static/example_file/all_pair.txt.zip

Click here (raw_value_file_) to download a one-color example.

.. _raw_value_file: /static/example_file/raw_value.txt.zip

Click here (GFF_file_) to download a GFF log2(ratio) file example.

.. _GFF_file: /static/example_file/log2ratio_file.txt.zip


---------

.. class:: infomark

**How does it work?**

For each chip the log2 of Cy5/Cy3 is calculated (if not already present).
All the chips are then normalized, according to the type of normalization selected.

 - **bi-weight** procedure scales all the probe log2ratio to center the data around zero. Scaling is performed by subtracting the bi-weight mean for the log2(ratio) values for all features on the array from each log2-ratio value.
 - **quantile** procedure normalizes the distributions of the probe log2ratio of each chip with a quantile normalization.

Moreover, the correlations between chips are calculated and graphs are produced as shown in the following figures.

.. image:: static/images/CARPET/distribution.png

.. image:: static/images/CARPET/correlation.png

The first two graphs are produced using the density function implemented in R.
The last graph is produced using the corPlot function implemented in Ringo package.
(The last graph is created only if more than one chip is uploaded.)


**OUTPUT FILE**

- If a summarization method is selected or only one chip is uploaded, a GFF file (ready to be used with PeakPicker) is created.
- If NO summarization method is selected and more than one file is uploaded, the output will look like the table below:

 .. image:: static/images/CARPET/output_no_sum.png
</help>
</tool>
author	matces
date	Tue, 07 Jun 2011 16:50:41 -0400
parents
children