diff carpet-src-1/tools/CARPET/norm_rep.xml @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/carpet-src-1/tools/CARPET/norm_rep.xml	Tue Jun 07 16:50:41 2011 -0400
@@ -0,0 +1,310 @@
+<tool id="normalization" name="PreProcess for Tiling" version="1.0.0">
+  <description>normalizing data</description>
+  <!-- Runs r_wrapper2.sh with bash, passing the path of the R script rendered from the "script_file" configfile. -->
+  <command interpreter="bash">r_wrapper2.sh $script_file</command>
+
+  <inputs>
+         <!-- Normalization applied across chips; the script tests this value against "bwm", "quantile" and "none". -->
+         <param name="type" type="select" label="Normalization">
+            <option value="bwm" selected="true">Bi-weight function</option>
+            <option value="quantile">Quantile</option>
+            <option value="none">None</option>
+         </param>
+         <!-- Per-probe summarization across chips; the script tests this value against "mean", "median" and "none". -->
+         <param name="sum" type="select" label="Summarization">
+            <option value="mean" selected="true">Mean</option>
+            <option value="median">Median</option>
+            <option value="none">None</option>
+         </param>
+         <repeat name="series" title="Chip">
+            <!-- Tabular dataset for one chip; the script loads it with read.table using a tab separator. -->
+            <param name="input" type="data" format="tabular" label="Dataset"/>
+            <!-- The selected value (T or F) is inserted as the header argument of read.table. -->
+            <param name="header" type="select" label="Headers">
+               <option value="T" selected="true">TRUE</option>
+               <option value="F">FALSE</option>
+            </param>
+            <!-- Column numbers of the dataset above; the script uses them to index the loaded table. -->
+            <param name="chrom_col" type="data_column" data_ref="input" label="Column for chr value (chr1,etc)"/>
+            <param name="start_col" type="data_column" data_ref="input" label="Column for start position"/>
+            
+            <!-- Where the end coordinate of each probe comes from: either a column of the
+                 dataset, or the start position plus a fixed average probe length. Both
+                 branches expose their value under the same name, "end_col". -->
+            <conditional name="fine_col">
+               <param name="si_o_no" type="select" label="End column">
+                  <option value="si_ce" selected="true">End column present</option>
+                  <option value="no_ce">End column NOT present</option>
+               </param>
+               <when value="si_ce">
+                  <param name="end_col" type="data_column" data_ref="input" label="Column for end position"/>
+               </when>
+               <when value="no_ce">
+                  <!-- Integer instead of free text: the value is pasted unquoted into the
+                       generated R script and added to the start position, so it has to be
+                       a plain number. -->
+                  <param name="end_col" type="integer" value="50" size="4" label="average length of the probes"/>
+               </when>
+            </conditional>
+            
+            <!-- Kind of signal stored in the dataset. Every branch defines value_col,
+                 value_col_cy3 and value_col_cy5; the ones a branch does not use are text
+                 placeholders defaulting to "NOT-NEEDED". The script template references all
+                 three names whatever type is selected, which is presumably why the
+                 placeholders exist (confirm before removing them). -->
+            <conditional name="data">
+                 <param name="data_type" type="select" label="Data type">
+                     <option value="log" selected="true">log2(ratio)</option>
+                     <option value="no_log">one color raw data</option>
+                     <option value="raw">Cy3-Cy5 raw data</option>
+                 </param>
+                 <when value="log">
+                   <param name="value_col" type="data_column" data_ref="input" label="Column for log2(ratio)"/>
+                   <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>  
+                   <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>  
+                 </when>
+                 <when value="no_log">
+                   <param name="value_col" type="data_column" data_ref="input" label="Column for raw data"/>
+                   <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>  
+                   <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>  
+                 </when>
+                 <when value="raw">
+                   <param name="value_col" type="text" value="NOT-NEEDED" size="12" label="Column for log2(ratio)"/>
+                   <param name="value_col_cy3" type="data_column" data_ref="input" label="Column for Cy3"/>  
+                   <param name="value_col_cy5" type="data_column" data_ref="input" label="Column for Cy5"/>  
+                 </when>
+            </conditional>
+            <!-- Colour of this chip's density curve; the selected value is passed as the col argument of lines() and legend() in the script. -->
+            <param name="col" type="select" label="Line Color">
+                <option value="1">Black</option>
+                <option value="2">Red</option>
+                <option value="3">Green</option>
+                <option value="4">Blue</option>
+                <option value="5">Cyan</option>
+                <option value="6">Magenta</option>
+                <option value="7">Yellow</option>
+                <option value="8">Gray</option>
+            </param>
+         </repeat>       
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## On any R error: write the message to stderr and quit with exit status 1
+      ## without saving the workspace.
+      options(
+        show.error.messages = FALSE,
+        error = function() {
+          cat(geterrmessage(), file = stderr())
+          q(save = "no", status = 1, runLast = FALSE)
+        }
+      )
+      ## Strongly prefer fixed over scientific notation when numbers are printed.
+      options(scipen = 999)
+      ## Attach the Ringo package; the returned value is stored but not used again.
+      ringo_attached = library(Ringo)
+      ## Everything plotted below goes into the PDF output dataset.
+      pdf("${out_file1}")
+      ## x-axis ranges of the raw and normalized density plots; both start empty.
+      xrange = NULL
+      xrange_norm = NULL
+      ## Read every chip. The first chip (index 0) also provides the coordinate columns
+      ## (coord_gff) and the constant columns firma and fine; every chip contributes one
+      ## column of log2 values to totali.
+      ## All data-type / end-column branches are emitted as R if statements that compare
+      ## string constants, so only the branch matching the user's choice is evaluated.
+      ## NOTE(review): later chips are appended with cbind and nothing here checks that
+      ## they have the same number and order of rows as the first chip.
+      #for $i, $s in enumerate( $series )
+        s${i} = read.table( "${s.input.file_name}",sep="\t",header=$s.header)
+        #if $i == 0
+          ## One row per probe: two constant leading columns and three constant trailing columns.
+          firma=matrix(c("GALAXY","CARPET"),length(s${i}[,${s.chrom_col}]),2,byrow=T)
+          fine=matrix(c(".",".","Cesaroni_et_al."),length(s${i}[,${s.chrom_col}]),3,byrow=T)
+          	
+          ## End coordinate: start + fixed probe length, or the selected end column.
+          	 if ("${s.fine_col.si_o_no}"== "no_ce"){
+                coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],as.numeric(s${i}[,${s.start_col}])+${s.fine_col.end_col})
+             }
+             if ("${s.fine_col.si_o_no}"== "si_ce"){
+                 coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],s${i}[,${s.fine_col.end_col}])
+             }
+             ## Signal of the first chip: log2(Cy5/Cy3) for two-colour raw data, the column
+             ## as it is for log2(ratio) data, log2 of the column for one-colour raw data.
+             if ("${s.data.data_type}" == "raw") {
+                 totali=log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}]))
+             }
+             if ("${s.data.data_type}" == "log") {
+                 totali=s${i}[,${s.data.value_col}]
+             }
+             if ("${s.data.data_type}" == "no_log") {
+              totali=log2(as.numeric(s${i}[,${s.data.value_col}]))
+             }
+             
+        #elif $i >0
+            ## Further chips: same three cases, appended to totali as a new column.
+            if ("${s.data.data_type}" == "raw") {
+               totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}])))
+            }
+            if ("${s.data.data_type}" == "log") {
+               totali=cbind(totali,s${i}[,${s.data.value_col}])
+            }
+            if ("${s.data.data_type}" == "no_log") {
+              totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col}])))
+            }
+        #end if
+      #end for
+      
+     
+      
+      ## Report how many chips were loaded (last loop index + 1).
+      chip_count_msg = paste("number of chips =", $i+1)
+      print(chip_count_msg, quote = FALSE)
+      ## Tukey bi-weight weighted mean of a numeric vector.
+      ##   x        numeric vector
+      ##   c        multiplier of the median absolute deviation that sets the cut-off
+      ##   epsilon  added to the scale so the denominator is non-zero when the
+      ##            median absolute deviation is 0
+      ## Values whose scaled distance from the median exceeds 1 get weight 0.
+      tukey.biweight = function(x, c = 5, epsilon = 1e-04) {
+        centre = median(x)
+        spread = median(abs(x - centre))
+        scaled = (x - centre)/(c * spread + epsilon)
+        within = abs(scaled) &lt;= 1
+        wt = numeric(length(x))
+        wt[within] = ((1 - scaled^2)^2)[within]
+        return(sum(wt * x)/sum(wt))
+      }
+      ## Normalize the chips column-wise according to the "Normalization" option.
+      totali=as.data.frame(totali)
+      if ("${type}" == "bwm"){
+        ## Subtract from every chip its own bi-weight mean and report each mean.
+        totali.tbw = apply(totali, 2, tukey.biweight)
+        totali_norm = totali - matrix(totali.tbw, nrow = nrow(totali), ncol = ncol(totali), byrow = TRUE)
+        for (i in seq_along(totali.tbw)){
+            print(paste("bi-weight_mean rep",i,"=",format(totali.tbw[i],digits=3),sep=" "),quote=F)
+        }
+      }
+      if ("${type}" == "quantile"){
+        if (ncol(totali) == 1) {
+            ## Quantile normalization needs at least two chips. Report the problem on
+            ## stderr and exit with status 1, as the script's error handler does, instead
+            ## of quitting with a success status and leaving the outputs unwritten.
+            cat("Quantile normalization is not feasible with one sample\n", file=stderr())
+            q("no", 1, F)
+        }
+        totali_norm=normalizeBetweenArrays(as.matrix(totali), method="quantile")
+      }
+      if ("${type}" == "none"){
+        totali_norm=totali
+      }
+      
+      ## Grow the x-axis ranges column by column so that they cover every chip:
+      ## xrange_norm for the normalized values, xrange for the raw ones.
+      n_norm_cols = ncol(as.data.frame(totali_norm))
+      for (norm_col in 1:n_norm_cols) {
+          xrange_norm = range(totali_norm[, norm_col], xrange_norm)
+      }
+
+      n_raw_cols = ncol(totali)
+      for (raw_col in 1:n_raw_cols) {
+          xrange = range(totali[, raw_col], xrange)
+      }
+          
+      ## First plot: empty frame spanning the raw value range.
+      ## NOTE(review): the y axis is fixed to 0..1.2, so a density peak above 1.2 would be
+      ## cut off; confirm this is acceptable for the expected data.
+      plot( NULL, type="n", xlim=xrange, ylim=c(0,1.2), main="Raw signal distribution", xlab="log2(ratio)",ylab="Density")
+      ## Draw one density curve per chip in its chosen colour and collect the colours in
+      ## colori, which both legends use.
+      #for $i, $s in enumerate( $series )
+        lines(density(totali[,${i}+1]), col="${s.col}" )
+        #if $i == 0
+         colori="${s.col}"
+        #elif $i >0
+         colori=rbind(colori,"${s.col}")
+        #end if
+      #end for 
+      ## Legend anchored at the left edge of the x range and the top of the y axis; labels rep_1, rep_2, ...
+      legend((xrange[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),sep="_"))
+
+      
+      ## Second plot: same layout as the raw plot, drawn from the normalized values.
+      plot( NULL, type="n", xlim=xrange_norm, ylim=c(0,1.2), main="Normalized signal distribution", xlab="log2(ratio)",ylab="Density")
+      ## One density curve per chip, in the colour chosen for that chip.
+      #for $i, $s in enumerate( $series )
+        lines(density(totali_norm[,${i}+1]), col="${s.col}" )
+      #end for 
+      ## Legend reuses the colours collected in colori; labels rep_1_norm, rep_2_norm, ...
+      legend((xrange_norm[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),"norm",sep="_"))
+
+      
+      
+      ## Correlation plot between the normalized chips, drawn only when the last loop
+      ## index is above 0, i.e. when more than one chip was loaded.
+      if (${i} > 0){
+      corPlot(as.matrix(totali_norm),grouping=paste("rep",c(1:(${i}+1)),"norm",sep="_"))
+      }
+      ## Close the PDF device; the return value is stored but not used afterwards.
+      devname = dev.off() 
+      ## Collapse the normalized chips into the final per-probe value(s) according to
+      ## the "Summarization" option, rounded to three decimals.
+      totali_norm = as.data.frame(totali_norm)
+      if ("${sum}" == "mean") {
+        total_sum = apply(totali_norm, 1, mean)
+      } else if ("${sum}" == "median") {
+        total_sum = apply(totali_norm, 1, median)
+      } else if ("${sum}" == "none") {
+        total_sum = totali_norm
+      }
+      total_sum = round(total_sum, digits = 3)
+      ## Output rows: coordinate columns, value column(s), then the constant trailing columns.
+      total_gff = cbind(coord_gff, total_sum, fine)
+      ## Rewrite the first "CHR" of every chromosome name as "chr".
+      total_gff[, 1] = as.vector(sub("CHR", "chr", total_gff[, 1]))
+      ## Tab-separated, unquoted, without header or row names.
+      write.table(total_gff, "${out_file2}", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
+      
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <!-- out_file1: PDF that receives the plots drawn by the script. -->
+    <data format="pdf" name="out_file1" />
+    <!-- out_file2: tab-separated table that the script writes with write.table. -->
+    <data format="tabular" name="out_file2" />
+  </outputs>
+
+<help>
+.. class:: infomark
+
+**What it does**
+
+PPT normalizes single ChIP-chip or multi ChIP-chip experiments.
+PPT also computes the correlation between replicates, produces several plots to help assess the quality of the experiment, and creates a GFF file suitable for PeakPicker analysis.
+
+PLEASE, for more detailed information refer to the CARPET user Manual:
+click to download_ it.
+
+.. _download: /static/example_file/CARPET_userManual.zip
+
+--------
+
+**Parameters:**
+
+- **Normalization:** 
+	- **Bi-weight function:** bi-weight function is used to scale all the chips (Standard Nimblegen normalization).
+	- **Quantile:** quantile normalization is performed between all the chips.
+	- **None:** no normalization is performed.
+	
+- **Summarization:** 
+	- **Mean:** the final value of each probe is the mean between all the chips.
+	- **Median:** the final value of each probe is the median between all the chips.
+	- **None:** all the values of each probe are given back.
+- **Chips:** 
+	- **Dataset:** input data file.
+	- **Headers:** if headers are present or not in the dataset file.
+	- **Column for chr value:** the column with the probe Chromosome numbers.
+	- **Column for start position:** the column with the probe start positions.
+	- **End column:** if the end position of the probes is present or not.
+	- **Column for end position:** the column with the probe end positions.
+	- **average length of the probes:** the average length of the probes (only for custom chip).
+	- **Data type:** choose between log2(ratio), one-color raw values (NOT log transformed) or Cy3-Cy5 raw values, according to the data format.
+	- **Column for log2(ratio):** the column with probe log2(ratio) values.
+	- **Column for raw data:** the column with probe raw values (NOT log transformed).
+	- **Column for Cy3:** the column with probe Cy3 raw value.
+	- **Column for Cy5:** the column with probe Cy5 raw value.
+	- **Line Color:** the line colors for the graphs created by the script.
+
+
+
+-----
+
+.. class:: warningmark
+
+This tool requires at least the following fields in each file or dataset: 
+	- Chromosome number in this format: chr1, chr2, etc.
+	- Start position
+	- One column with log2(ratio) or one-color raw values, or two columns with Cy3 and Cy5 raw values
+
+--------
+
+
+
+**INPUT FILE**
+
+This tool accepts any kind of file, with at least the fields described above.
+
+Click here (pair_file_) to download a Cy3-Cy5 pair file example.
+
+.. _pair_file: /static/example_file/all_pair.txt.zip
+
+Click here (raw_value_file_) to download a one-color example.
+
+.. _raw_value_file: /static/example_file/raw_value.txt.zip
+
+Click here (GFF_file_) to download a GFF log2(ratio) file example.
+
+.. _GFF_file: /static/example_file/log2ratio_file.txt.zip
+
+
+---------
+
+.. class:: infomark
+
+**How does it work?**
+
+For each chip the log2 of Cy5/Cy3 is calculated (if not already present).
+All the chips are then normalized, according to the type of normalization selected.
+
+ - **bi-weight** procedure scales all the probe log2ratio to center the data around zero. Scaling is performed by subtracting the bi-weight mean for the log2(ratio) values for all features on the array from each log2-ratio value.
+ - **quantile** procedure normalizes the distributions of the probe log2ratio of each chip with a quantile normalization.
+ 
+Moreover, the correlations between chips are calculated and graphs are produced as shown in the following figures.
+
+.. image:: static/images/CARPET/distribution.png
+
+.. image:: static/images/CARPET/correlation.png
+
+The first two graphs are produced using the density function implemented in R. 
+The last graph is produced using the corPlot function implemented in Ringo package.
+(The last graph is created only if more than one chip is uploaded.)
+
+
+**OUTPUT FILE**
+
+- If a summarization method is selected or only one chip is uploaded, a GFF file (ready to be used with PeakPicker) is created.
+- If no summarization method is selected and more than one file is uploaded, the output will be like in the table below:
+ 
+ .. image:: static/images/CARPET/output_no_sum.png
+</help>
+</tool>