view galaxy/wrappers/ArrayNormalization.xml @ 0:708f43bda2b6 draft

"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author vandelj
date Fri, 26 Jun 2020 09:35:11 -0400
parents
children 2762a2622e0d
line wrap: on
line source

<tool name="GIANT-Normalization with APT Summarize" id="giant_aptsummarize" version="0.1.1">
  <description>Apply Affymetrix Power Tool summarize function to .CEL collection</description>
  <!-- Software needed by the job. gawk is declared explicitly because the
       post-treatment scripts generated by this tool use the gawk-only ARGIND
       variable and arrays of arrays (sum[a][b]); another awk implementation
       would not run them as intended. -->
  <requirements>
    <requirement type="package" version="2.10.1">apt-probeset-summarize</requirement>
    <requirement type="package" version="5.1.0">gawk</requirement>
  </requirements>
  <!-- Failure detection.
       * "Execution halted" on stdout or stderr is always fatal.
       * 1 to 9: exit status forwarded from apt-probeset-summarize.
       * 10: exit status used by the post-treatment scripts.
       * 11 and above: anything else (command not found, process killed,
         crash, or a higher APT status forwarded by the command), so that
         such jobs are not reported as successful. -->
  <stdio>
    <regex match="Execution halted"
           source="both"
           level="fatal"
           description="Execution halted, please contact tool developer or administrators." />
    <exit_code range="1:9" level="fatal" description="Error during apt command execution, see log file for more information." />
    <exit_code range="10" level="fatal" description="Error in post-treatments, see log file for more information." />
    <exit_code range="11:" level="fatal" description="Unexpected exit code, see log file for more information." />
  </stdio>



   <command><![CDATA[
  ## Runs apt-probeset-summarize on the selected .CEL files, then the generated
  ## post-treatment scripts (matrix formatting and, on request, annotation).
  ## Every shell statement ends with ';' as in the original template.
  ## NOTE(review): the option lines of the APT call are spread over several
  ## template lines; this presumably relies on Galaxy flattening the command
  ## to a single shell line - confirm for the targeted tool profile.
  ##
  ## Parameters living inside the "mainCondition" conditional are referenced
  ## with their full path, and dataset paths / names are single-quoted so that
  ## spaces in them do not split shell words.

  ##ONLY FOR LOCAL TEST PURPOSE, COMMENT OTHERWISE
  ##set $pathToApt='/mnt/galaxy/home/galaxy/Software/ThermoFischer/apt-1.20.6-x86_64-intel-linux/bin'

    mkdir -p ./apt_output/;

  #if $mainCondition.arraySelection=="other":
    #if $mainCondition.CDFfile:
    ## The .cdf file must keep its original name, so it is copied under it.
    cp '$mainCondition.CDFfile' './${mainCondition.CDFfile.name}';
    #end if
  #end if

## run APT command

##ONLY FOR LOCAL TEST PURPOSE COMMENT IF NOT NECESSARY
##${pathToApt}/apt-probeset-summarize -a $commandLine
##AND UNCOMMENT LINE BELOW
apt-probeset-summarize -a '$commandLine'

#if $mainCondition.arraySelection=="other":
  ## User-supplied library files: .pgf + .clf and/or .cdf, optional .mps / .bgp.
  #if $mainCondition.PGFfile and $mainCondition.CLFfile:
  --use-pgf-names
  -p '$mainCondition.PGFfile'
  -c '$mainCondition.CLFfile'
  #end if
  #if $mainCondition.CDFfile:
  -d './${mainCondition.CDFfile.name}'
  #end if
  #if $mainCondition.MPSfile and $normLevel=="core":
  -m '$mainCondition.MPSfile'
  #end if
  #if $mainCondition.BGPfile:
  -b '$mainCondition.BGPfile'
  #end if

#else:
  ## Pre-loaded array: paths come from the data table, "null" marks a missing file.
  #if $mainCondition.arrayID.fields.pathPGF!="null" and $mainCondition.arrayID.fields.pathCLF!="null":
  --use-pgf-names
  -p '$mainCondition.arrayID.fields.pathPGF'
  -c '$mainCondition.arrayID.fields.pathCLF'
  #else:
    #if $mainCondition.arrayID.fields.pathCDF!="null":
  -d '$mainCondition.arrayID.fields.pathCDF'
    #end if
  #end if
  #if $normLevel=="core" and $mainCondition.arrayID.fields.pathMPS!="null":
  -m '$mainCondition.arrayID.fields.pathMPS'
  #end if
  #if $mainCondition.arrayID.fields.pathBGP!="null":
  -b '$mainCondition.arrayID.fields.pathBGP'
  #end if
#end if

-o ./apt_output/

\$(echo '$inputData' | tr "," " ") >> '$log' 2>&1;

ret_code=\$?;

## The file list is passed as an argument, not as the printf format string.
printf '%s' "[INFO]Normalized files: $inputData" >> '$log';

## Forward the APT exit status unchanged when the summarization failed.
if [ "\$ret_code" -ne 0 ]; then
    exit \$ret_code;
fi;

## launch post-treatment scripts
## A failing script stops the job with status 10; without this the final
## printf below would reset the job exit status to 0.
bash '$scriptFormat' || exit 10;

#if $annotationCondition.addAnnotation=="true":
  echo "[INFO]Annotation file used:" >> '$log';
  ## The annotation source is logged before the merge runs, so the log still
  ## identifies it when the merge fails.
  #if $mainCondition.arraySelection=="other":
  echo '$mainCondition.annotationFile.name' >> '$log';
  bash '$scriptAnnotation' '$mainCondition.annotationFile' || exit 10;
  #else:
  cat '$mainCondition.arrayID.fields.versionInfo' >> '$log';
    #if $normLevel=="core":
  bash '$scriptAnnotation' '$mainCondition.arrayID.fields.pathAnnotTrans' || exit 10;
    #else:
  bash '$scriptAnnotation' '$mainCondition.arrayID.fields.pathAnnotProbe' || exit 10;
    #end if
  #end if
#end if

printf "[INFO]End of tool script" >> '$log';
	]]>
  </command>



  <configfiles>
    <configfile name="scriptFormat">
<![CDATA[
## Post-treatment script: turns the APT summary file into the final matrix.
## 1. The dataset names of the selected .CEL files are written one per line.
## 2. awk drops the summary lines whose first field matches the comment
##    pattern, replaces the first remaining line by "Conditions" followed by
##    the dataset names, and copies the other lines unchanged.
## ARGIND is a gawk extension.
#for $inputDataset in $inputData
        echo '${inputDataset.name}' >> ./tempColumnName.txt
#end for
awk ' BEGIN{firstLine=1;OFS="";ORS=""} ARGIND==1{tab[FNR]=$1;nbCol=FNR} ARGIND==2 && !($1 ~ /\\#/){if(firstLine==0){print "\n"\$0}else{print "Conditions";for(i=1;i<=nbCol;i++){print "\t"tab[i]};firstLine=0}}' ./tempColumnName.txt ./apt_output/*summary.txt  > '$outputData'
awk_status=\$?

## The redirection above always creates the output file, so testing its
## existence cannot detect a failure: check the awk status and that the
## result is not empty instead.
if [ "\$awk_status" -ne 0 ] || [ ! -s '$outputData' ]; then
  printf "[ERROR]Formating results failed" >> '$log';
  exit 10
fi
]]>
   </configfile> 

   <configfile name="scriptAnnotation">
<![CDATA[
## Post-treatment script: replaces probe set identifiers by their annotation.
## First positional argument: two-column annotation file (identifier, annotation).
## In every awk program below "dico" maps identifier to annotation (first
## input file) and the second input file is the normalized matrix, whose
## first line is the header. ARGIND and arrays of arrays are gawk extensions.
## keepAnnotated=="true" drops the rows without annotation; otherwise they
## are copied unchanged.
#if $annotationCondition.addAnnotation=="true":
#if $annotationCondition.mergingMethod=="none":
  ## No merging: annotated rows are renamed "annotation_(identifier)".

  #if $annotationCondition.keepAnnotated=="true":
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0} ARGIND==2 && FNR>1{if(\$1 in dico){\$1=dico[\$1]"_("\$1")";print \$0}}' "\$1" '$outputData' > ./tempData
  #else:
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0} ARGIND==2 && FNR>1{if(\$1 in dico){\$1=dico[\$1]"_("\$1")"};print \$0}' "\$1" '$outputData' > ./tempData
  #end if

#end if 
#if $annotationCondition.mergingMethod=="mean":
  ## Rows sharing an annotation are averaged column by column.

  #if $annotationCondition.keepAnnotated=="true":
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){cpt[dico[\$1]]++;for(i=2;i<=nbCol;i++)sum[dico[\$1]][i]+=\$i}} END{for(iName in cpt){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"sum[iName][iCol]/cpt[iName]};print line}}' "\$1" '$outputData' > ./tempData
  #else:
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){cpt[dico[\$1]]++;for(i=2;i<=nbCol;i++)sum[dico[\$1]][i]+=\$i}else{print \$0}} END{for(iName in cpt){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"sum[iName][iCol]/cpt[iName]};print line}}' "\$1" '$outputData' > ./tempData
  #end if

#end if 

#if $annotationCondition.mergingMethod=="higherVar":
  ## For each annotation keep the row with the largest sum of squared
  ## deviations from its own mean. The first row met for an annotation is
  ## always stored (membership test on "var"), so an annotation whose rows
  ## all have a zero spread is kept instead of being dropped.

  #if $annotationCondition.keepAnnotated=="true":
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(!(dico[\$1] in var) || curVar>var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' "\$1" '$outputData' > ./tempData
  #else:
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(!(dico[\$1] in var) || curVar>var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}else{print \$0}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' "\$1" '$outputData' > ./tempData
  #end if

#end if 

#if $annotationCondition.mergingMethod=="lowerVar":
  ## Same as above but keeping the smallest spread. "Not seen yet" is tested
  ## by membership rather than by comparing the stored value with 0, so a row
  ## with a true zero spread is not replaced by a later, more variable one.

  #if $annotationCondition.keepAnnotated=="true":
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(!(dico[\$1] in var) || curVar<var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' "\$1" '$outputData' > ./tempData
  #else:
    awk 'BEGIN{OFS="\t"} ARGIND==1{dico[\$1]=\$2} ARGIND==2 && FNR==1{print \$0;nbCol=NF} ARGIND==2 && FNR>1{if(\$1 in dico){mean=0;for(i=2;i<=nbCol;i++){mean+=\$i};mean=mean/(nbCol-1);curVar=0;for(i=2;i<=nbCol;i++){curVar+=(mean-\$i)^2};if(!(dico[\$1] in var) || curVar<var[dico[\$1]]){var[dico[\$1]]=curVar;for(i=2;i<=nbCol;i++){bestVar[dico[\$1]][i]=\$i}}}else{print \$0}} END{for(iName in bestVar){line=iName;for(iCol=2;iCol<=nbCol;iCol++){line=line"\t"bestVar[iName][iCol]};print line}}' "\$1" '$outputData' > ./tempData
  #end if

#end if 
awk_status=\$?

## The redirection always creates ./tempData, so its existence proves
## nothing: require a successful awk run and a non-empty result (the header
## line is always written on success) before replacing the output.
if [ "\$awk_status" -eq 0 ] && [ -s ./tempData ]; then
  mv -f ./tempData '$outputData'
else
  printf "[ERROR]Annotation merge failed" >> '$log'; 
  exit 10
fi

#end if 

]]>
   </configfile> 
  </configfiles>



  <inputs>
    <!-- Free text used as prefix in the labels of both outputs. -->
    <param name="title" type="text" value="APT_toPersonalize" label="Title for output"/>

    <!-- One or several .CEL datasets to summarize together. -->
    <param name="inputData" type="data" format="cel" multiple="true" optional="false"
           label=".cel collection file">
      <validator type="empty_dataset" message="At least one data file should be selected"/>
    </param>

    <!-- The selected value is given to the -a option of apt-probeset-summarize. -->
    <param name="commandLine" type="select" display="radio"
           label="Normalization to perform"
           help="For more details go to APT webpage">
      <option value="rma-gc-scale" selected="true">gc correction + scale intensity + rma</option>
      <option value="scale-intensities,rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish">scale intensity + rma</option>
      <option value="gc-correction,rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish">gc correction + rma</option>
      <option value="rma">rma</option>
    </param>

    <!-- "core" enables the use of the .mps file (-m) when one is available. -->
    <param name="normLevel" type="select" display="radio"
           label="Normalization level"
           help="'Core genes' option is not available for all arrays">
      <option value="core">Core genes</option>
      <option value="probeset">Probe set</option>
    </param>

    <!-- Source of the array library files: data table entry or user datasets. -->
    <conditional name="mainCondition">
      <param name="arraySelection" type="select" label="Select GeneChip array configuration files">
        <option value="common">pre-loaded array files</option>
        <option value="other">array files selection</option>
      </param>

      <when value="common">
        <param name="arrayID" type="select" label="Name">
          <options from_data_table="aptTool"/>
        </param>
      </when>

      <when value="other">
        <param name="PGFfile" type="data" format="data" multiple="false" optional="true"
               label=".pgf indicates which probes are grouped together into a probeset*"/>

        <param name="CLFfile" type="data" format="data" multiple="false" optional="true"
               label=".clf indicates where the probes are located in the CEL file*"/>

        <param name="CDFfile" type="data" format="data" multiple="false" optional="true"
               label=".cdf contains both .pgf and .clf informations* (requested only if .pgf and .clf files are not available)"/>

        <param name="MPSfile" type="data" format="data" multiple="false" optional="true"
               label=".mps defines a probeset as a collection of existing probesets* (requested for normalization at gene level)"
               help="If .mps is not selected/available a 'probeset level' normalization will be automatically performed"/>

        <param name="BGPfile" type="data" format="data" multiple="false" optional="true"
               label=".bgp indicates which probes are to be used for computing background* (requested only if GC driven background correction is applied)"
               help="*Files should be downloaded from corresponding array specific Affymetrix webpage to insure format compatibility"/>

        <param name="annotationFile" type="data" format="tabular" multiple="false" optional="true"
               label="tabular file containing available probesets annotation (requested if annotation options are selected)"/>
      </when>
    </conditional>

    <!-- Optional annotation step. Despite its name, keepAnnotated set to true
         discards the probe sets that have no annotation (see its label). -->
    <conditional name="annotationCondition">
      <param name="addAnnotation" type="boolean" checked="false" label="Add gene annotation"/>

      <when value="true">
        <param name="keepAnnotated" type="boolean" checked="false"
               label="Discard probe set without gene annotation"/>

        <param name="mergingMethod" type="select" display="radio"
               label="Merging approach for probe set with same gene annotation">
          <option value="none">No merging</option>
          <option value="mean" selected="true">Mean between probes [recommended]</option>
          <option value="higherVar">Keep probe with higher variance</option>
          <option value="lowerVar">Keep probe with lower variance</option>
        </param>
      </when>

      <when value="false"/>
    </conditional>
  </inputs>



  <!-- Two datasets per run: the normalized matrix and the execution log.
       Both labels start with the value of the "title" parameter. -->
  <outputs>
    <data name="outputData" format="tabular" label="${title}_NormalizedData"/>
    <data name="log" format="txt" label="${title}_Log"/>
  </outputs>
  


 <!-- Functional test: three .CEL files, user-supplied .cdf, core level,
      annotation added with mean merging and unannotated probe sets kept. -->
 <tests>
   <test maxseconds="3600">
     <!-- data and normalization choices -->
     <param name="inputData" value="./CELfiles/GSM205766.CEL,./CELfiles/GSM205767.CEL,./CELfiles/GSM205768.CEL"/>
     <param name="commandLine" value="rma-gc-scale"/>
     <param name="normLevel" value="core"/>
     <!-- array library files given by the user -->
     <param name="arraySelection" value="other"/>
     <param name="CDFfile" value="./Mouse430_2.cdf"/>
     <param name="annotationFile" value="./formatedAnnotation.csv"/>
     <!-- annotation options -->
     <param name="addAnnotation" value="true"/>
     <param name="keepAnnotated" value="false"/>
     <param name="mergingMethod" value="mean"/>
     <!-- expected results; two differing lines are tolerated in the log -->
     <output name="outputData" file="./APT-summarize/expressionOutput.csv"/>
     <output name="log" file="./APT-summarize/output.log" lines_diff="2"/>
   </test>
 </tests>




  <help>
  <![CDATA[
**What it does** 

To normalize expression data from Affymetrix GeneChip arrays through Affymetrix Power Tools software.

-----

**Parameters**

\- **Title** to personalize output file names (please avoid special characters).


\- **.CEL files** you want to normalize (you can select multiple .CEL files or unique collection).


\- **Normalization to perform** : as this tool runs the 'apt-probeset-summarize' command line from Affymetrix Power Tools software, this field represents the desired analysis pathway (-a parameter). Analysis pathways are selected through predefined pathway aliases; the full description of all steps run by each alias is given below:

    - gc correction + scale intensity + rma: "gc-correction,scale-intensities,rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish"

    - scale intensity + rma: "scale-intensities,rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish"

    - gc correction + rma: "gc-correction,rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish"

    - rma: "rma-bg,quant-norm.sketch=0.usepm=true.bioc=true,pm-only,med-polish"

Please refer to `APT tool web page`__ to get more details about analysis steps.

\- **Normalization level** to choose when GeneChip array kind allows such distinction (i.e. when a .mps file exists in Affymetrix webpage). 'Probe set level' is the first level where probes for same exon are merged in probe sets. 'Gene level' [recommended] is an upper level where several probe sets are merged in meta probe sets. Although a meta probe set can be seen as a good single gene expression indicator, several meta probe sets can still share the same gene annotation.

\- **GeneChip array configuration files** according to .CEL files. Configuration files for given arrays can be pre-loaded by administrators of the used Galaxy instance and listed in the aptTool.loc file. In this case you just have to select the array kind according to .CEL files, thus pre-loaded configuration files will be used automatically for normalization. If configuration files were not pre-loaded, you have to choose "array files selection" and then select manually configuration files for the corresponding array. Configuration files (.pgf, .clf, .cdf, .mps, .bgp) are available in library archive files from the Affymetrix webpage of the array (as an example, configuration files for HTA2.0 array are found in the 'Main Analysis Files' in the 'Support file' section of `this page`__). Not all of these files are mandatory for every normalization. For basic normalization, both .pgf and .clf files or only .cdf file are mandatory. .mps file is requested for gene level normalization if it is available (see Normalization level parameter). .bgp file is requested for analysis pathway with specific background computation. Annotation file is a simple two column tabular file, with first column containing probe set names and second corresponding annotations; it should not contain any column header. This file can be generated from Affymetrix annotation file available in the same section as array configuration files in Affymetrix webpages, please keep a trace of used annotation version to ensure reproducibility.

Extract of annotation file::

    TC01000020.hg.1  SAMD11 
    TC01000021.hg.1  KLHL17 
    TC01000022.hg.1  PLEKHN1 
    TC01000023.hg.1  ISG15 
    TC01000024.hg.1  AGRN 

\- **Gene annotation** : add gene annotation when available (for pre-loaded arrays, annotation file path should be filled in the aptTool.loc file, for other arrays, annotation file should be selected manually). You can choose to keep or discard in normalized file (meta) probe sets without match in annotation file and select merging policy for probe sets sharing same gene annotation.

-----

**Advanced parameters**

\- **Additional parameter** to give to 'apt-probeset-summarize' command line except -a, -o, -d, -p, -c, -m, -b parameters already filled by previous fields.

-----

**Outputs**

\- **tabular file** containing log2 transformed normalized data as tab delimited matrix. First row contains .CEL names and first column contains (meta) probe set names and/or annotated gene names::

    Conditions  157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL  155_(HuGene-2_0-st).CEL    154_(HuGene-2_0-st).CEL                        
    DDX11L2     4.500872                4.429759                 4.780281                   4.996189             
    MIR1302-2   3.415065                3.520472                 3.471503                   3.567988           
    OR4F5       3.737956                3.011586                 3.424494                   3.497545
    VWA1        5.189621                5.129595                 4.806793                   5.227014

\- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file to any bug report.

    .. __: http://media.affymetrix.com/support/developer/powertools/changelog/apt-probeset-summarize.html#intro

    .. __: https://www.thermofisher.com/order/catalog/product/902233#/902233

]]> 
  </help>


 <!-- References shown with the tool: the GIANT tool suite and the wrapped
      Affymetrix Power Tools software. -->
 <citations>
   <citation type="bibtex">@misc{vandel_jimmy_2018_1477870,
  author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.},
  title = {GIANT: Galaxy-based Interactive tools for ANalysis of Transcriptomic data},
  month = nov,
  year = 2018,
  doi = {10.5281/zenodo.1477870},
  url = {https://doi.org/10.5281/zenodo.1477870}
  }</citation>

   <citation type="bibtex">@online{apt,
  author = {Thermo Fisher Scientific},
  title = {Affymetrix Power Tools},
  publisher = {Life Technologies Corp.},
  address = {USA, MA},
  version = {2.10.2},
  year = {2018},
  url = {https://www.affymetrix.com/support/developer/powertools/changelog/index.html}
  }</citation>
 </citations>
</tool>