Mercurial > repos > matces > carpet_toolsuite
diff carpet-src-1/tools/CARPET/norm_rep.xml @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
line wrap: on
line diff
<tool id="normalization" name="PreProcess for Tiling" version="1.0.0">
  <!--
    Galaxy wrapper that normalizes one or more tiling-array (ChIP-chip) datasets
    with an R script generated from the Cheetah template in <configfiles>.
    Outputs: a PDF with density (and, for more than one chip, correlation) plots
    and a tab-separated, GFF-like table of the normalized / summarized values.
  -->
  <description>normalizing data</description>
  <!-- r_wrapper2.sh is not part of this file; it receives the path of the generated R script. -->
  <command interpreter="bash">r_wrapper2.sh $script_file</command>

  <inputs>
    <!-- Between-chip normalization method; compared as "${type}" in the R script. -->
    <param name="type" type="select" label="Normalization">
      <option value="bwm" selected="true">Bi-weight function</option>
      <option value="quantile">Quantile</option>
      <option value="none">None</option>
    </param>
    <!-- How per-probe values of all chips are collapsed; compared as "${sum}" in the R script. -->
    <param name="sum" type="select" label="Summarization">
      <option value="mean" selected="true">Mean</option>
      <option value="median">Median</option>
      <option value="none">None</option>
    </param>
    <!-- One repeat entry per chip. Only the FIRST entry supplies the genomic coordinates. -->
    <repeat name="series" title="Chip">
      <param name="input" type="data" format="tabular" label="Dataset"/>
      <!-- Passed verbatim as the header= argument of read.table (T or F). -->
      <param name="header" type="select" label="Headers">
        <option value="T" selected="true">TRUE</option>
        <option value="F">FALSE</option>
      </param>
      <param name="chrom_col" type="data_column" data_ref="input" label="Column for chr value (chr1,etc)"/>
      <param name="start_col" type="data_column" data_ref="input" label="Column for start position"/>

      <!--
        end_col is either a column index (si_ce) or a probe length that the
        script adds to the start position (no_ce).
      -->
      <conditional name="fine_col">
        <param name="si_o_no" type="select" label="End column">
          <option value="si_ce" selected="true">End column present</option>
          <option value="no_ce">End column NOT present</option>
        </param>
        <when value="si_ce">
          <param name="end_col" type="data_column" data_ref="input" label="Column for end position"/>
        </when>
        <when value="no_ce">
          <param name="end_col" type="text" value="50" size="4" label="average length of the probes"/>
        </when>
      </conditional>

      <!--
        Every branch defines value_col, value_col_cy3 and value_col_cy5; the ones
        a branch does not use are text fields preset to NOT-NEEDED (presumably so
        that the template can reference all three names in every branch).
      -->
      <conditional name="data">
        <param name="data_type" type="select" label="Data type">
          <option value="log" selected="true">log2(ratio)</option>
          <option value="no_log">one color raw data</option>
          <option value="raw">Cy3-Cy5 raw data</option>
        </param>
        <when value="log">
          <param name="value_col" type="data_column" data_ref="input" label="Column for log2(ratio)"/>
          <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>
          <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>
        </when>
        <when value="no_log">
          <param name="value_col" type="data_column" data_ref="input" label="Column for raw data"/>
          <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>
          <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>
        </when>
        <when value="raw">
          <param name="value_col" type="text" value="NOT-NEEDED" size="12" label="Column for log2(ratio)"/>
          <param name="value_col_cy3" type="data_column" data_ref="input" label="Column for Cy3"/>
          <param name="value_col_cy5" type="data_column" data_ref="input" label="Column for Cy5"/>
        </when>
      </conditional>
      <!-- Option values are passed to R as the col= argument of lines() and legend(). -->
      <param name="col" type="select" label="Line Color">
        <option value="1">Black</option>
        <option value="2">Red</option>
        <option value="3">Green</option>
        <option value="4">Blue</option>
        <option value="5">Cyan</option>
        <option value="6">Magenta</option>
        <option value="7">Yellow</option>
        <option value="8">Gray</option>
      </param>
    </repeat>
  </inputs>

  <configfiles>
    <!--
      The script is wrapped in CDATA because it contains a literal "<"
      (abs(u) <= 1), which is not allowed in plain XML character data.
      Inside the CDATA section only Cheetah "##" comments may be used.
    -->
    <configfile name="script_file"><![CDATA[
      ## Setup R error handling to go to stderr
      options( show.error.messages=F,
               error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
      ## Print numbers in fixed notation instead of scientific notation
      options(scipen=999)
      ## The return value of library() is captured so that it is not printed
      loaded_pkgs=library(Ringo)
      pdf( "${out_file1}" )
      xrange = c( NULL, NULL )
      xrange_norm = c( NULL, NULL )

      ## Read every chip. The first chip also provides the coordinate columns
      ## (coord_gff) and the constant GFF columns (firma, fine); every chip
      ## contributes one column of log2 values to "totali".
      #for $i, $s in enumerate( $series )
        s${i} = read.table( "${s.input.file_name}",sep="\t",header=$s.header)
        #if $i == 0
          firma=matrix(c("GALAXY","CARPET"),length(s${i}[,${s.chrom_col}]),2,byrow=T)
          fine=matrix(c(".",".","Cesaroni_et_al."),length(s${i}[,${s.chrom_col}]),3,byrow=T)

          ## No end column: end = start + probe length typed by the user
          if ("${s.fine_col.si_o_no}"== "no_ce"){
            coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],as.numeric(s${i}[,${s.start_col}])+${s.fine_col.end_col})
          }
          if ("${s.fine_col.si_o_no}"== "si_ce"){
            coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],s${i}[,${s.fine_col.end_col}])
          }
          ## raw: log2(Cy5/Cy3); log: value used as is; no_log: log2(value)
          if ("${s.data.data_type}" == "raw") {
            totali=log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}]))
          }
          if ("${s.data.data_type}" == "log") {
            totali=s${i}[,${s.data.value_col}]
          }
          if ("${s.data.data_type}" == "no_log") {
            totali=log2(as.numeric(s${i}[,${s.data.value_col}]))
          }

        #elif $i >0
          if ("${s.data.data_type}" == "raw") {
            totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}])))
          }
          if ("${s.data.data_type}" == "log") {
            totali=cbind(totali,s${i}[,${s.data.value_col}])
          }
          if ("${s.data.data_type}" == "no_log") {
            totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col}])))
          }
        #end if
      #end for



      ## After the loop the Cheetah variable i holds the index of the last chip,
      ## so i+1 is the number of chips.
      print (paste("number of chips =",$i+1,sep=" "),quote=F)

      ## Weighted mean of x: weights are (1 - u^2)^2 for |u| <= 1 and 0 otherwise,
      ## with u = (x - median) / (c * MAD + epsilon).
      tukey.biweight = function(x, c = 5, epsilon = 1e-04) {
        m = median(x)
        s = median(abs(x - m))
        u = (x - m)/(c * s + epsilon)
        w = rep(0, length(x))
        ii = abs(u) <= 1
        w[ii] = ((1 - u^2)^2)[ii]
        t.bi = sum(w * x)/sum(w)
        return(t.bi)
      }

      totali=as.data.frame(totali)
      if ("${type}" == "bwm"){
        ## Subtract each chip's bi-weight mean from all of its values
        totali.tbw = apply(totali, 2, tukey.biweight)
        totali_norm = totali - matrix(totali.tbw, nrow = nrow(totali), ncol = ncol(totali), byrow = TRUE)
        for (i in 1:length(totali.tbw)){
          print(paste("bi-weight_mean rep",i,"=",format(totali.tbw[i],digits=3),sep=" "),quote=F)
        }
      }
      if ("${type}" == "quantile"){
        if (length(totali) == 1) {
          ## Nothing can be produced in this case: report on stderr and exit
          ## with a non-zero status (same convention as the error handler
          ## above) instead of quitting "successfully" with empty outputs.
          cat("Quantile normalization is not feasible with one sample\n", file=stderr())
          q( "no", 1, F )
        }
        totali_norm=normalizeBetweenArrays(as.matrix(totali), method="quantile")
      }
      if ("${type}" == "none"){
        totali_norm=totali
      }

      ## Determine range of all series in the plots (normalized and raw)
      for (j in 1:length(as.data.frame(totali_norm)))
        xrange_norm=range(totali_norm[,j],xrange_norm)

      for (jj in 1:length(totali))
        xrange=range(totali[,jj],xrange)

      plot( NULL, type="n", xlim=xrange, ylim=c(0,1.2), main="Raw signal distribution", xlab="log2(ratio)",ylab="Density")
      ## Plot each series and collect the chosen colors for the legends
      #for $i, $s in enumerate( $series )
        lines(density(totali[,${i}+1]), col="${s.col}" )
        #if $i == 0
          colori="${s.col}"
        #elif $i >0
          colori=rbind(colori,"${s.col}")
        #end if
      #end for
      legend((xrange[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),sep="_"))


      plot( NULL, type="n", xlim=xrange_norm, ylim=c(0,1.2), main="Normalized signal distribution", xlab="log2(ratio)",ylab="Density")
      ## Plot each series
      #for $i, $s in enumerate( $series )
        lines(density(totali_norm[,${i}+1]), col="${s.col}" )
      #end for
      legend((xrange_norm[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),"norm",sep="_"))



      ## Correlation plot only when there is more than one chip
      if (${i} > 0){
        corPlot(as.matrix(totali_norm),grouping=paste("rep",c(1:(${i}+1)),"norm",sep="_"))
      }
      devname = dev.off()

      ## Collapse the chips per probe (or keep all columns for "none")
      totali_norm=as.data.frame(totali_norm)
      if ("${sum}" == "mean"){
        total_sum=apply(totali_norm,1,mean)
      }
      if ("${sum}" == "median"){
        total_sum=apply(totali_norm,1,median)
      }
      if ("${sum}" == "none"){
        total_sum=totali_norm
      }
      total_sum=round(total_sum,digits=3)

      ## Columns: chr, "GALAXY", "CARPET", start, end, value(s), ".", ".", "Cesaroni_et_al."
      total_gff=cbind(coord_gff,total_sum,fine)
      ## Replace the first "CHR" in each chromosome name with "chr"
      chrom_names=sub("CHR","chr",total_gff[,1])
      total_gff[,1]=as.vector(chrom_names)
      write.table(total_gff,"${out_file2}",sep="\t",quote=F,col.names=F,row.names=F)

    ]]></configfile>
  </configfiles>

  <outputs>
    <!-- out_file1: plots written by pdf(); out_file2: table written by write.table(). -->
    <data format="pdf" name="out_file1" />
    <data format="tabular" name="out_file2" />
  </outputs>

<help>
.. class:: infomark

**What it does**

PPT normalizes single ChIP-chip or multi ChIP-chip experiments.
PPT also compares the correlation between replicates and produces different plots to better understand the goodness of the experiment and creates a GFF file suitable for PeakPicker analysis.

PLEASE, for more detailed information refer to the CARPET user Manual:
click to download_ it.

.. _download: /static/example_file/CARPET_userManual.zip

--------

**Parameters:**

- **Normalization:**
  - **Bi-weight function:** bi-weight function is used to scale all the chips (Standard Nimblegen normalization).
  - **Quantile:** quantile normalization is performed between all the chips.
  - **None:** no normalization is performed.

- **Summarization:**
  - **Mean:** the final value of each probe is the mean between all the chips.
  - **Median:** the final value of each probe is the median between all the chips.
  - **None:** all the values of each probe are given back.
- **Chips:**
  - **Dataset:** input data file.
  - **Headers:** if headers are present or not in the dataset file.
  - **Column for chr value:** the column with the probe Chromosome numbers.
  - **Column for start position:** the column with the probe start positions.
  - **End column:** if the end position of the probes is present or not.
  - **Column for end position:** the column with the probe end positions.
  - **average length of the probes:** the average length of the probes (only for custom chip).
  - **Data type:** choose between log2(ratio) or raw value (NOT log transformed) or Cy3-Cy5 raw values according to data format.
  - **Column for log2(ratio):** the column with probe log2(ratio) values.
  - **Column for raw data:** the column with probe raw values (NOT log transformed).
  - **Column for Cy3:** the column with probe Cy3 raw value.
  - **Column for Cy5:** the column with probe Cy5 raw value.
  - **Line Color:** the line colors for graphs created by the script.



-----

.. class:: warningmark

This tool requires at least the following fields in each file or dataset:
 - Chromosome number in this format : chr1 , chr2, etc etc.
 - Start position
 - one column with log2(ratio) or two columns with Cy3 and Cy5 raw values

--------



**INPUT FILE**

This tool accepts any kind of file, with at least the fields described above.

Click here (pair_file_) to download a Cy3-Cy5 pair file example.

.. _pair_file: /static/example_file/all_pair.txt.zip

Click here (raw_value_file_) to download a one-color example.

.. _raw_value_file: /static/example_file/raw_value.txt.zip

Click here (GFF_file_) to download a GFF log2(ratio) file example.

.. _GFF_file: /static/example_file/log2ratio_file.txt.zip


---------

.. class:: infomark

**How does it work?**

For each chip the log2 of Cy5/Cy3 is calculated (if not already present).
All the chips are then normalized, according to the type of normalization selected.

 - **bi-weight** procedure scales all the probe log2ratio to center the data around zero. Scaling is performed by subtracting the bi-weight mean for the log2(ratio) values for all features on the array from each log2-ratio value.
 - **quantile** procedure normalizes the distributions of the probe log2ratio of each chip with a quantile normalization.

Moreover, the correlations between chips are calculated and graphs are produced as shown in the following figures.

.. image:: static/images/CARPET/distribution.png

.. image:: static/images/CARPET/correlation.png

The first two graphs are produced using the density function implemented in R.
The last graph is produced using the corPlot function implemented in Ringo package.
(The last graph is created only if more than one chip is uploaded.)


**OUTPUT FILE**

- If a summarization method is selected or only one chip is uploaded, a GFF file (ready to be used with PeakPicker) is created.
- if NO summarization methods are selected and more than one file is uploaded, the output will be like in the table below:

  .. image:: static/images/CARPET/output_no_sum.png
</help>
</tool>