Mercurial > repos > sblanck > mpagenomics
diff preprocess.xml @ 0:4d539083cf7f draft
planemo upload for repository https://github.com/sblanck/MPAgenomics4Galaxy/tree/master/mpagenomics_wrappers commit 689d0d8dc899a683ee18700ef385753559850233-dirty
author | sblanck |
---|---|
date | Tue, 12 May 2020 10:40:36 -0400 |
parents | |
children | 3fcbb8030fcc |
line wrap: on
line diff
<tool id="preprocess" name="Data Normalization" force_history_refresh="True" version="1.1.0">
    <!--
        Galaxy wrapper that runs preprocess.R (shipped next to this file) to
        normalize a set of Affymetrix CEL files with the mpagenomics R package.
        Inputs: the CEL files, the four chip annotation files (cdf, ufl, ugp,
        acs) and, for normal-tumor studies, a csv pairing file.
        Outputs: a dataset summary file, plus an optional zip of figures and
        an optional log, each created only when the matching select is "TRUE".
    -->
    <requirements>
        <!--requirement type="set_environment">R_SCRIPT_PATH</requirement-->
        <requirement type="package" version="1.1.2">mpagenomics</requirement>
    </requirements>
    <!--command interpreter="python"-->
    <!--
        Command template notes:
        * Every interpolated value is wrapped in single quotes so that the
          shell does not expand characters found in user-supplied dataset
          names. This now includes the list of CEL inputs, which was
          previously double-quoted.
        * The list of CEL inputs is serialized as "path;name," repeated once
          per selected file.
        * tumorcsv lives inside the "settings" conditional, so it has to be
          referenced through that conditional (settings.tumorcsv). The literal
          'none' is passed when no reference study is selected.
    -->
    <command>
        <![CDATA[
        Rscript
        '${__tool_directory__}/preprocess.R'
        --summary '$summary'
        --new_file_path '$__new_file_path__'
        --inputcdffull_name '$inputcdffull.name'
        --inputufl_name '$inputufl.name'
        --inputugp_name '$inputugp.name'
        --inputacs_name '$inputacs.name'
        --inputcdffull '$inputcdffull'
        --inputufl '$inputufl'
        --inputugp '$inputugp'
        --inputacs '$inputacs'
        --dataSetName '$datasetName'
        #if $settings.settingsType == "tumor":
            --tumorcsv '$settings.tumorcsv'
        #end if
        #if $settings.settingsType == "standard":
            --tumorcsv 'none'
        #end if
        --settingsType '$settings.settingsType'
        --outputgraph '$outputgraph'
        --zipfigures '$zipfigures'
        --outputlog '$outputlog'
        --log '$log'
        --user_id '$__user_id__'
        --input '#for $input in $inputs# $input;$input.name, #end for#'
        ]]>
    </command>
    <inputs>
        <param name="datasetName" type="text" label="Dataset Name"/>
        <param name="inputs"
               type="data"
               format="cel"
               multiple="True"
               label="Cel files dataset"
               help="Cel files dataset previously uploaded with the Multiple File Datasets tool."/>
        <!-- "<" and ">" must be written as entities inside attribute values. -->
        <param name="inputcdffull"
               type="data"
               format="cdf"
               label="cdf file"
               help=".cdf file name must comply with the following format : &lt; chiptype &gt;,&lt; tag &gt;.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf)."/>
        <param name="inputufl"
               type="data"
               format="ufl"
               label="ufl file"
               help=".ufl file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl)."/>
        <param name="inputugp"
               type="data"
               format="ugp"
               label="ugp file"
               help=".ugp file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp)."/>
        <param name="inputacs"
               type="data"
               format="acs"
               label="acs file"
               help=".acs file name must start with &lt; chiptype &gt;,&lt; tag &gt; (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs)."/>
        <!-- Selecting "tumor" reveals the csv file that pairs normal and tumor samples. -->
        <conditional name="settings">
            <param name="settingsType" type="select" label="Reference">
                <option value="standard">Study without reference</option>
                <option value="tumor">Normal-tumor study with TumorBoost</option>
            </param>
            <when value="standard"/>
            <when value="tumor">
                <param name="tumorcsv"
                       type="data"
                       format="csv"
                       label="TumorBoost csv file"
                       help="Normal-tumor csv file. See below for more information."/>
            </when>
        </conditional>
        <!--param name="outputgraph" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output figures" /-->
        <!--param name="outputlog" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output log" /-->
        <!-- The "TRUE"/"FALSE" values below are compared by the output filters and forwarded to preprocess.R. -->
        <param name="outputgraph" type="select" label="Output figures">
            <option value="TRUE">Yes</option>
            <option value="FALSE">No</option>
        </param>
        <param name="outputlog" type="select" label="Output log">
            <option value="TRUE">Yes</option>
            <option value="FALSE">No</option>
        </param>
        <!--param name="chipType" type="text" label="chipType" /-->
        <!--param name="workspace" type="text" label="Workspace"/-->
    </inputs>

    <outputs>
        <!-- Would like to make this hidden or not appear all together, but
             variable outputs require a primary dataset. If hidden refresh
             doesn't occur.
        -->
        <data format="dsf" name="summary" label="Dataset summary file of ${datasetName}"/>
        <data format="zip" name="zipfigures" label="figures of normalization of ${datasetName}">
            <filter>outputgraph == "TRUE"</filter>
        </data>
        <data format="log" name="log" label="log of normalization ${datasetName}">
            <filter>outputlog == "TRUE"</filter>
        </data>
    </outputs>

    <!-- Any non-zero exit code from the command is reported as a fatal error. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="See logs for more details"/>
    </stdio>

    <!-- The help body is reStructuredText; CDATA keeps its literal "<", ">" and "&" well-formed. -->
    <help><![CDATA[

**What it does**

This preprocessing step consists in a correction of biological and technical biases due to the experiment. Raw data from Affymetrix arrays are provided in different CEL files. These data must be normalized before statistical analysis.
The pre-processing is proposed as a wrapper of aroma.* packages (using CRMAv2 and TumorBoost when appropriate). Note that this implies that the pre-processing step is only available for Affymetrix arrays.

-----

**Chip file naming conventions**

Chip filenames must strictly follow the following rules :

- *.cdf* filename must comply with the following format : < chiptype >,< tag >.cdf (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full.cdf). Note the use of a comma (not a point) between <chiptype> and the tag "Full".

- *.ufl* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ufl).

- *.ugp* filename must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,Full,na31,hg19,HB20110328.ugp).

- *.acs* file name must start with < chiptype >,< tag > (e.g, for a GenomeWideSNP_6 chip: GenomeWideSNP_6,HB20080710.acs).

-----

**Normal-tumor study with TumorBoost**

In cases where normal (control) samples match to tumor samples, normalization can be improved using TumorBoost. In this case, a normal-tumor csv file must be provided :

  - The first column contains the names of the files corresponding to normal samples of the dataset.

  - The second column contains the names of the tumor samples files.

  - Column names of these two columns are respectively normal and tumor.

  - Columns are separated by a comma.

  - *Extensions of the files (.CEL for example) should be removed*



**Example**

Let 6 .cel files in the dataset studied (3 patients, each of them being represented by a couple of normal and tumor cel files.) ::

    patient1_normal.cel
    patient1_tumor.cel
    patient2_normal.cel
    patient2_tumor.cel
    patient3_normal.cel
    patient3_tumor.cel


The csv file should look like this ::

    normal,tumor
    patient1_normal,patient1_tumor
    patient2_normal,patient2_tumor
    patient3_normal,patient3_tumor


-----

**Citation**

When using this tool, please cite :

`Q. Grimonprez, A. Celisse, M. Cheok, M. Figeac, and G. Marot. MPAgenomics : An R package for multi-patients analysis of genomic markers, 2014. Preprint <http://fr.arxiv.org/abs/1401.5035>`_

As CRMAv2 normalization is used, please also cite `H. Bengtsson, P. Wirapati, and T. P. Speed. A single-array preprocessing method for estimating full-resolution raw copy numbers from all Affymetrix genotyping arrays including GenomeWideSNP 5 & 6. Bioinformatics, 25(17):2149–2156, 2009. <http://bioinformatics.oxfordjournals.org/content/25/17/2149.short>`_

When using TumorBoost to improve normalization in a normal-tumor study, please cite `H. Bengtsson, P. Neuvial, and T. P. Speed. TumorBoost: Normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays. BMC Bioinformatics, 11, 2010 <http://www.biomedcentral.com/1471-2105/11/245>`_

    ]]></help>
</tool>