comparison carpet-src-1/tools/CARPET/norm_rep.xml @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:cdd489d98766
1 <tool id="normalization" name="PreProcess for Tiling" version="1.0.0">
2 <description>normalizing data</description>
3 <command interpreter="bash">r_wrapper2.sh $script_file</command>
4
5 <inputs>
<!-- Global options: how the chips are normalized and how the per-probe values are summarized. -->
6 <param name="type" type="select" label="Normalization">
7 <option value="bwm" selected="true">Bi-weight function</option>
8 <option value="quantile">Quantile</option>
9 <option value="none">None</option>
10 </param>
11 <param name="sum" type="select" label="Summarization">
12 <option value="mean" selected="true">Mean</option>
13 <option value="median">Median</option>
14 <option value="none">None</option>
15 </param>
<!-- One "Chip" entry per input dataset; the script template loops over these entries. -->
16 <repeat name="series" title="Chip">
17 <param name="input" type="data" format="tabular" label="Dataset"/>
<!-- The selected value (T or F) is passed as the header argument of read.table. -->
18 <param name="header" type="select" label="Headers">
19 <option value="T" selected="true">TRUE</option>
20 <option value="F">FALSE</option>
21 </param>
22 <param name="chrom_col" type="data_column" data_ref="input" label="Column for chr value (chr1,etc)"/>
23 <param name="start_col" type="data_column" data_ref="input" label="Column for start position"/>
24
<!-- Both branches define end_col: a column index when the dataset has an end column,
     otherwise a number (default 50) that the script adds to the start position. -->
25 <conditional name="fine_col">
26 <param name="si_o_no" type="select" label="End column">
27 <option value="si_ce" selected="true">End column present</option>
28 <option value="no_ce">End column NOT present</option>
29 </param>
30 <when value="si_ce">
31 <param name="end_col" type="data_column" data_ref="input" label="Column for end position"/>
32 </when>
33 <when value="no_ce">
34 <param name="end_col" type="text" value="50" size="4" label="average length of the probes"/>
35 </when>
36 </conditional>
37
<!-- Every branch defines value_col, value_col_cy3 and value_col_cy5 because the script
     template references all three for every chip; the ones that do not apply to the
     chosen data type are text fields pre-filled with NOT-NEEDED. -->
38 <conditional name="data">
39 <param name="data_type" type="select" label="Data type">
40 <option value="log" selected="true">log2(ratio)</option>
41 <option value="no_log">one color raw data</option>
42 <option value="raw">Cy3-Cy5 raw data</option>
43 </param>
44 <when value="log">
45 <param name="value_col" type="data_column" data_ref="input" label="Column for log2(ratio)"/>
46 <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>
47 <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>
48 </when>
49 <when value="no_log">
50 <param name="value_col" type="data_column" data_ref="input" label="Column for raw data"/>
51 <param name="value_col_cy3" type="text" value="NOT-NEEDED" size="12" label="Column for Cy3"/>
52 <param name="value_col_cy5" type="text" value="NOT-NEEDED" size="12" label="Column for Cy5"/>
53 </when>
54 <when value="raw">
55 <param name="value_col" type="text" value="NOT-NEEDED" size="12" label="Column for log2(ratio)"/>
56 <param name="value_col_cy3" type="data_column" data_ref="input" label="Column for Cy3"/>
57 <param name="value_col_cy5" type="data_column" data_ref="input" label="Column for Cy5"/>
58 </when>
59 </conditional>
<!-- The selected value is passed as col= to the density lines drawn for this chip. -->
60 <param name="col" type="select" label="Line Color">
61 <option value="1">Black</option>
62 <option value="2">Red</option>
63 <option value="3">Green</option>
64 <option value="4">Blue</option>
65 <option value="5">Cyan</option>
66 <option value="6">Magenta</option>
67 <option value="7">Yellow</option>
68 <option value="8">Gray</option>
69 </param>
70 </repeat>
71 </inputs>
72
73 <configfiles>
74 <configfile name="script_file">
75 ## Setup R error handling to go to stderr
76 options( show.error.messages=F,
77 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
## On any R error the handler above writes the error message to stderr and
## quits with exit status 1 without saving the workspace.
78 ## Determine range of all series in the plot
## scipen=999 makes R strongly prefer fixed over scientific notation when
## numbers are converted to text.
79 options(scipen=999)
## Load the Ringo package; the value returned by library() is stored in a
## variable that is not used anywhere else in this script.
80 ciccioo=library(Ringo)
## Open the PDF output; every plot drawn before dev.off() goes into this file.
81 pdf( "${out_file1}" )
## c( NULL, NULL ) evaluates to NULL; both ranges are filled in later by
## repeated calls to range().
82 xrange = c( NULL, NULL )
83 xrange_norm = c( NULL, NULL )
## Cheetah loop: the lines below are emitted once per "Chip" entry, with the
## zero-based entry index substituted for the i placeholder, so each dataset is
## read into its own data frame (s0, s1, ...).
## The end-column and data-type choices are tested by R at run time, not by
## Cheetah, so every branch below is emitted into the generated script.
84 #for $i, $s in enumerate( $series )
85 s${i} = read.table( "${s.input.file_name}",sep="\t",header=$s.header)
86 #if $i == 0
## First chip only: build the constant columns (firma, fine) and the coordinate
## columns of the output. Only this chip supplies the coordinates.
87 firma=matrix(c("GALAXY","CARPET"),length(s${i}[,${s.chrom_col}]),2,byrow=T)
88 fine=matrix(c(".",".","Cesaroni_et_al."),length(s${i}[,${s.chrom_col}]),3,byrow=T)
89
## coord_gff columns: chromosome, "GALAXY", "CARPET", start, end.
## Without an end column the end is start plus the number typed by the user.
90 if ("${s.fine_col.si_o_no}"== "no_ce"){
91 coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],as.numeric(s${i}[,${s.start_col}])+${s.fine_col.end_col})
92 }
93 if ("${s.fine_col.si_o_no}"== "si_ce"){
94 coord_gff=cbind(as.character(s${i}[,${s.chrom_col}]),firma,s${i}[,${s.start_col}],s${i}[,${s.fine_col.end_col}])
95 }
## totali collects one column of log2 values per chip:
##   raw    - log2(Cy5 / Cy3) computed from the two selected columns
##   log    - the selected column, used unchanged
##   no_log - log2 of the selected column
96 if ("${s.data.data_type}" == "raw") {
97 totali=log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}]))
98 }
99 if ("${s.data.data_type}" == "log") {
100 totali=s${i}[,${s.data.value_col}]
101 }
102 if ("${s.data.data_type}" == "no_log") {
103 totali=log2(as.numeric(s${i}[,${s.data.value_col}]))
104 }
105
106 #elif $i >0
## Later chips: append their log2 values as additional columns of totali.
107 if ("${s.data.data_type}" == "raw") {
108 totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col_cy5}])/as.numeric(s${i}[,${s.data.value_col_cy3}])))
109 }
110 if ("${s.data.data_type}" == "log") {
111 totali=cbind(totali,s${i}[,${s.data.value_col}])
112 }
113 if ("${s.data.data_type}" == "no_log") {
114 totali=cbind(totali,log2(as.numeric(s${i}[,${s.data.value_col}])))
115 }
116 #end if
117 #end for
118
119
120
## After the Cheetah loop the loop variable still holds the index of the last
## chip, so adding 1 gives the number of chips.
121 print (paste("number of chips =",$i+1,sep=" "),quote=F)
## tukey.biweight: Tukey bi-weight weighted mean of a numeric vector.
##   x       numeric vector of values
##   c       multiplier applied to the median absolute deviation (default 5)
##   epsilon small constant added to the denominator (default 1e-04); it keeps
##           the denominator non-zero when the median absolute deviation is 0
## m is the median of x, s the median absolute deviation from m, and u the
## deviation of each value from m divided by (c * s + epsilon).
## Values with abs(u) above 1 get weight 0, the others get (1 - u^2)^2.
## Returns sum(w * x) / sum(w).
## "&lt;=" below is the XML escape of the less-than-or-equal operator.
122 tukey.biweight = function(x, c = 5, epsilon = 1e-04) {
123 m = median(x)
124 s = median(abs(x - m))
125 u = (x - m)/(c * s + epsilon)
126 w = rep(0, length(x))
127 ii = abs(u) &lt;= 1
128 w[ii] = ((1 - u^2)^2)[ii]
129 t.bi = sum(w * x)/sum(w)
130 return(t.bi)
131 }
## Convert to a data frame so that a single chip is also handled as a table
## with one column.
132 totali=as.data.frame(totali)
## Bi-weight normalization: compute one bi-weight mean per column (chip) and
## subtract it from every value of that column. byrow = TRUE spreads the
## per-chip means across the columns of the subtracted matrix.
133 if ("${type}" == "bwm"){
134 totali.tbw = apply(totali, 2, tukey.biweight)
135 totali_norm = totali - matrix(totali.tbw, nrow = nrow(totali), ncol = ncol(totali), byrow = TRUE)
## Report the bi-weight mean of each chip on standard output.
136 for (i in 1:length(totali.tbw)){
137 print(paste("bi-weight_mean rep",i,"=",format(totali.tbw[i],digits=3),sep=" "),quote=F)
138 }
139 }
## Quantile normalization between chips. length() of a data frame is its number
## of columns, so the inner test detects the single-chip case.
140 if ("${type}" == "quantile"){
141 if (length(totali) == 1) {
142 ## Raise an error so the handler set at the top of the script reports it on stderr and exits with status 1.
143 stop("Quantile normalization is not feasible with one sample")
144 }
145 totali_norm=normalizeBetweenArrays(as.matrix(totali), method="quantile")
146 }
## No normalization: the normalized table is the raw table.
147 if ("${type}" == "none"){
148 totali_norm=totali
149 }
150
## Accumulate the overall value range of the normalized and of the raw data,
## one column at a time; the ranges are used as x-axis limits below.
## NOTE(review): as.data.frame() is presumably needed because totali_norm may be
## a matrix after quantile normalization - confirm.
151 for (j in 1:length(as.data.frame(totali_norm)))
152 xrange_norm=range(totali_norm[,j],xrange_norm)
153
154 for (jj in 1:length(totali))
155 xrange=range(totali[,jj],xrange)
156
## First page: empty frame with a fixed y-axis of 0 to 1.2, then one density
## curve per chip for the raw values.
157 plot( NULL, type="n", xlim=xrange, ylim=c(0,1.2), main="Raw signal distribution", xlab="log2(ratio)",ylab="Density")
158 ## Plot each series
## The Cheetah index is zero-based, hence the +1 for the R column index.
## colori collects the colour of every chip for the legends.
159 #for $i, $s in enumerate( $series )
160 lines(density(totali[,${i}+1]), col="${s.col}" )
161 #if $i == 0
162 colori="${s.col}"
163 #elif $i >0
164 colori=rbind(colori,"${s.col}")
165 #end if
166 #end for
## After the loop the index placeholder still refers to the last chip, so
## index + 1 is the number of chips used to build the legend labels.
167 legend((xrange[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),sep="_"))
168
169
## Second page: the same plot for the normalized values.
170 plot( NULL, type="n", xlim=xrange_norm, ylim=c(0,1.2), main="Normalized signal distribution", xlab="log2(ratio)",ylab="Density")
171 ## Plot each series
172 #for $i, $s in enumerate( $series )
173 lines(density(totali_norm[,${i}+1]), col="${s.col}" )
174 #end for
175 legend((xrange_norm[1]), 1.2,pch="-", col=as.vector(colori),legend=paste("rep",c(1:(${i}+1)),"norm",sep="_"))
176
177
178
## The correlation plot is drawn only when more than one chip was supplied
## (the index of the last chip is greater than 0).
179 if (${i} > 0){
180 corPlot(as.matrix(totali_norm),grouping=paste("rep",c(1:(${i}+1)),"norm",sep="_"))
181 }
## Close the PDF device; the return value is stored in a variable that is not
## used again.
182 devname = dev.off()
183 totali_norm=as.data.frame(totali_norm)
## Summarize across chips: apply(..., 1, f) computes f for every row (probe).
## With "none" all normalized columns are kept.
184 if ("${sum}" == "mean"){
185 total_sum=apply(totali_norm,1,mean)
186 }
187 if ("${sum}" == "median"){
188 total_sum=apply(totali_norm,1,median)
189 }
190 if ("${sum}" == "none"){
191 total_sum=totali_norm
192 }
193 total_sum=round(total_sum,digits=3)
## Output row layout: the coordinate columns built for the first chip, the
## rounded value column(s), then the three constant columns held in fine.
194 total_gff=cbind(coord_gff,total_sum,fine)
## sub() replaces only the first occurrence of "CHR" in each chromosome name.
195 cazzolina=sub("CHR","chr",total_gff[,1])
196 total_gff[,1]=as.vector(cazzolina)
## Tab-separated output without quotes, header line or row names.
197 write.table(total_gff,"${out_file2}",sep="\t",quote=F,col.names=F,row.names=F)
198
199 </configfile>
200 </configfiles>
201
202 <outputs>
<!-- out_file1: PDF opened by the script with pdf() and filled with its plots. -->
203 <data format="pdf" name="out_file1" />
<!-- out_file2: tab-separated table written by the script with write.table(). -->
204 <data format="tabular" name="out_file2" />
205 </outputs>
206
207 <help>
208 .. class:: infomark
209
210 **What it does**
211
212 PPT normalizes single ChIP-chip or multi ChIP-chip experiments.
213 PPT also compares the correlation between replicates, produces several plots to help assess the quality of the experiment, and creates a GFF file suitable for PeakPicker analysis.
214
215 PLEASE, for more detailed information refer to the CARPET user Manual:
216 click to download_ it.
217
218 .. _download: /static/example_file/CARPET_userManual.zip
219
220 --------
221
222 **Parameters:**
223
224 - **Normalization:**
225 - **Bi-weight function:** bi-weight function is used to scale all the chips (Standard Nimblegen normalization).
226 - **Quantile:** quantile normalization is performed between all the chips.
227 - **None:** no normalization is performed.
228
229 - **Summarization:**
230 - **Mean:** the final value of each probe is the mean between all the chips.
231 - **Median:** the final value of each probe is the median between all the chips.
232 - **None:** all the values of each probe are given back.
233 - **Chips:**
234 - **Dataset:** input data file.
235 - **Headers:** if headers are present or not in the dataset file.
236 - **Column for chr value:** the column with the probe Chromosome numbers.
237 - **Column for start position:** the column with the probe start positions.
238 - **End column:** if the end position of the probes is present or not.
239 - **Column for end position:** the column with the probe end positions.
240 - **average length of the probes:** the average length of the probes (only for custom chip).
241 - **Data type:** choose between log2(ratio), one-color raw values (NOT log-transformed), or Cy3-Cy5 raw values, according to the data format.
242 - **Column for log2(ratio):** the column with probe log2(ratio) values.
243 - **Column for raw data:** the column with probe raw values (NOT log-transformed).
244 - **Column for Cy3:** the column with probe Cy3 raw value.
245 - **Column for Cy5:** the column with probe Cy5 raw value.
246 - **Line Color:** the line colors for the graphs created by the script.
247
248
249
250 -----
251
252 .. class:: warningmark
253
254 This tool requires at least the following fields in each file or dataset:
255 - Chromosome number in this format : chr1 , chr2, etc etc.
256 - Start position
257 - one column with log2(ratio) or two columns with Cy3 and Cy5 raw values
258
259 --------
260
261
262
263 **INPUT FILE**
264
265 This tool accepts any kind of file, with at least the fields described above.
266
267 Click here (pair_file_) to download a Cy3-Cy5 pair file example.
268
269 .. _pair_file: /static/example_file/all_pair.txt.zip
270
271 Click here (raw_value_file_) to download a one-color example.
272
273 .. _raw_value_file: /static/example_file/raw_value.txt.zip
274
275 Click here (GFF_file_) to download a GFF log2(ratio) file example.
276
277 .. _GFF_file: /static/example_file/log2ratio_file.txt.zip
278
279
280 ---------
281
282 .. class:: infomark
283
284 **How does it work?**
285
286 For each chip the log2 of Cy5/Cy3 is calculated (if not already present).
287 All the chips are then normalized, according to the type of normalization selected.
288
289 - **bi-weight** procedure scales all the probe log2ratio to center the data around zero. Scaling is performed by subtracting the bi-weight mean for the log2(ratio) values for all features on the array from each log2-ratio value.
290 - **quantile** procedure normalizes the distributions of the probe log2ratio of each chip with a quantile normalization.
291
292 Moreover, the correlations between chips are calculated and graphs are produced as shown in the following figures.
293
294 .. image:: static/images/CARPET/distribution.png
295
296 .. image:: static/images/CARPET/correlation.png
297
298 The first two graphs are produced using the density function implemented in R.
299 The last graph is produced using the corPlot function implemented in Ringo package.
300 (The last graph is created only if more than one chip is uploaded.)
301
302
303 **OUTPUT FILE**
304
305 - If a summarization method is selected or only one chip is uploaded, a GFF file (ready to be used with PeakPicker) is created.
306 - If NO summarization method is selected and more than one file is uploaded, the output will look like the table below:
307
308 .. image:: static/images/CARPET/output_no_sum.png
309 </help>
310 </tool>