diff galaxy/wrappers/FormatForGSEA.xml @ 0:3022feec50fe draft

"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author vandelj
date Fri, 26 Jun 2020 09:36:46 -0400
parents
children d72f1bc5ce9e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy/wrappers/FormatForGSEA.xml	Fri Jun 26 09:36:46 2020 -0400
@@ -0,0 +1,228 @@
+<tool name="GIANT-GSEA Formatting" id="giant_gsea_format" version="0.2.0">
+  <description>Format input files for GSEA software</description>
+  <code file="../../src/General_functions.py"/>
+  <stdio>
+    <regex match="Execution halted"
+           source="both"
+           level="fatal"
+           description="Execution halted." />
+    <exit_code range="10" level="fatal" description="Error during file generation, see log file for more information." />
+  </stdio>
+  <command>	<![CDATA[
+#if $mainCondition.selection=="classicGSEA":
+
+  awk 'BEGIN{FS="\t";OFS="";ORS="";nlines=0} ARGIND==1 && FNR>1{nlines++} ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"} ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' $mainCondition.expressionData $mainCondition.expressionData > $outExpression;
+
+  if [ ! -e $outExpression ]; then
+    printf "[ERROR]Expression file failed" >> $log; 
+    exit 10;
+  fi
+  ;
+  awk -v factor="$mainCondition.factorToInclude" 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0} ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1} ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i} ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd} END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' $mainCondition.expressionData $mainCondition.conditionInformation > $outPhenotypes;
+
+  if [ ! -e $outPhenotypes ]; then
+    printf "[ERROR]Phenotype file failed" >> $log; 
+    exit 10;
+  fi
+  ;
+#else:
+
+  if [ \$(awk 'NR==1{print (NF-2)%5}' $mainCondition.differentialAnalysis ) -ne 0 ]; then
+    printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> $log;
+    exit 10;
+  fi
+  ;
+
+  awk -v comparison="$mainCondition.comparisonsToUse" -v rkIndice="$mainCondition.rankingIndice" -v pvalTh="$mainCondition.pvalThreshold" 'BEGIN{FS="\t"} NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};} NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)<pvalTh){if(rkIndice=="Tstat"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)<pvalTh){if(rkIndice=="FC"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' $mainCondition.differentialAnalysis > ./temp.txt;
+  printf "#NAME\tSCORE\n" > $outRankedGenes;
+  LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> $outRankedGenes;
+  rm ./temp.txt;
+
+  if [ ! -e $outRankedGenes ]; then
+    printf "[ERROR]Rank file failed" >> $log; 
+    exit 10;
+  fi
+  ;
+#end if
+
+  printf "[INFO]End of tool script" >> $log; 
+	]]>
+  </command>
+  <inputs>
+    <param type="text" name="title" value="GSEAformat_toPersonalize" label="Title for output (without space)"/>
+    <conditional name="mainCondition">
+      <param name="selection" type="select" label="GSEA configuration" force_select="true">
+      <option value="classicGSEA">GSEA analysis </option>
+      <option value="rankedGSEA">Pre-ranked GSEA analysis</option>
+      </param>
+      <when value="classicGSEA">
+        <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" optional="false"  multiple="false">
+        </param>
+        <param type="data" name="conditionInformation" format="tabular" label="Factor information tabular file" optional="false" multiple="false">
+        </param>
+            <param name="factorToInclude" type="select" label="Reference factor" multiple="false" optional="false"  refresh_on_change="true"
+              dynamic_options="get_column_names_filteredList(mainCondition['conditionInformation'].file_name,[0])">
+        </param>
+      </when>
+      <when value="rankedGSEA">
+      	<param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" multiple="false" help="This file should contain only annotated gene names or only probe identifiers as rows but no both kinds.">
+        </param>
+        <param name="comparisonsToUse" type="select" label="Reference contrast" optional="false" multiple="false" refresh_on_change="true"  dynamic_options="get_column_names_filteredList(mainCondition['differentialAnalysis'].file_name,[0,1],5)">
+          <validator type="empty_field" message="You should specify one factor"></validator>
+        </param>
+        <param type="select" name="rankingIndice" display="radio" label="Reference statistic">
+            <option value="FC">Relative value of Log2(Fold Change)</option>
+            <option value="absFC">Absolute value of Log2(Fold Change)</option>
+            <option value="Tstat">Relative value of moderated t-statistic</option>
+            <option value="absTstat">Absolute value of moderated t-statistic</option>
+        </param>
+         <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold">
+                   <validator type="in_range" min="0" max="1" exclude_min="true" message="Threshold should be between 0 and 1"/>
+          </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="gct" name="outExpression" label="${title}_Expressions">
+    <filter>mainCondition['selection']=="classicGSEA"</filter>
+    </data>
+
+    <data format="cls" name="outPhenotypes" label="${title}_Phenotypes">
+    <filter>mainCondition['selection']=="classicGSEA"</filter>
+    </data>
+
+    <data format="rnk" name="outRankedGenes" label="${title}_Ranked_Genes">
+    <filter>mainCondition['selection']=="rankedGSEA"</filter>
+    </data>
+	
+    <data format="txt" name="log" label="${title}_Log" />
+  </outputs>  
+
+ <tests>
+  <test maxseconds="3600" >
+    <param name="selection" value="classicGSEA" />
+    <param name="expressionData" value="./NormalizedData.tabular" />
+    <param name="conditionInformation" value="./conditionGroups.txt" />
+    <param name="factorToInclude" value="Treatment" />
+    <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" />
+    <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesTreatment.cls" />
+    <output name="log" file="./GSEA-Formatting/outputRanks.log" />
+  </test>
+  <test maxseconds="3600" >
+    <param name="selection" value="classicGSEA" />
+    <param name="expressionData" value="./NormalizedData.tabular" />
+    <param name="conditionInformation" value="./conditionGroups.txt" />
+    <param name="factorToInclude" value="Type" />
+    <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" />
+    <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesType.cls" />
+    <output name="log" file="./GSEA-Formatting/outputRanks.log" />
+  </test>
+  <test maxseconds="3600" >
+    <param name="selection" value="rankedGSEA" />
+    <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" />
+    <param name="comparisonsToUse" value="WT*Control-KO*Control" />
+    <param name="rankingIndice" value="FC" />
+    <param name="pvalThreshold" value="0.05" />
+    <output name="outRankedGenes" file="./GSEA-Formatting/outputRanks.rnk" />
+    <output name="log" file="./GSEA-Formatting/outputRanks.log" />
+  </test>
+</tests> 
+  <help>
+  <![CDATA[
+**What it does** 
+
+Generate input files required for GSEA analysis.
+
+-----
+
+**Parameters**
+
+\- **Title** to personalize output file names (please avoid special characters).
+
+\- **GSEA configuration** : "GSEA analysis" requires expression dataset and phenotype label for each sample whereas "Pre-ranked GSEA Analysis" needs ranked list of samples extracted from differential analysis results.
+
+- **GSEA Analysis**
+
+    \- **Expression tabular file** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers).
+
+    ::
+
+        Conditions  157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL  155_(HuGene-2_0-st).CEL    154_(HuGene-2_0-st).CEL                        
+        DDX11L2     4.500872                4.429759                 4.780281                   4.996189             
+        MIR1302-2   3.415065                3.520472                 3.471503                   3.567988           
+        OR4F5       3.737956                3.011586                 3.424494                   3.497545
+        VWA1        5.189621                5.129595                 4.806793                   5.227014
+
+    \- **Factor information tabular file** with factors as columns and samples as rows (header row contains factor names and first column sample names).
+
+    ::
+
+        Conditions                Sex   Treatment Reaction
+        138_(HuGene-2_0-st).CEL   1     TreatA    Pos
+        148_(HuGene-2_0-st).CEL   0     NoTreat   Pos
+        139_(HuGene-2_0-st).CEL   0     TreatB    Neg
+        149_(HuGene-2_0-st).CEL   0     NoTreat   Neg
+
+    \- **Reference factor** to use as phenotype in GSEA amongst available factors in factor information file.
+
+- **Pre-ranked GSEA Analysis**
+
+    \- **Differential analysis tabular file** with contrasts statistics (p-val, FDR p-val, FC, log2(FC) and moderated t-statistic) as columns and genes as rows (first and second rows contain contrasts definition and first and second columns contain gene identifiers and functional informations).  Please respect the GIANT-Differential Expression Analysis with LIMMA tool output format.
+
+      ::
+
+        LIMMA    comparison  WT*Treat  WT*Treat  WT*Treat  WT*Treat  WT*Treat
+        Gene     Info        p-val     FDR.p-val FC        log2(FC)  t-stat
+        ARSD     na          0.0057    0.41      0.8389   -0.2534   -5.175
+        TTTY10   na          1.6e-07   0.0074    0.6403   -0.6432   -6.122
+        MIR548AL na          0.072     0.2914    1.711     0.775     10.43
+
+
+    \- **Reference contrast** from available contrasts in differential analysis file to use for gene ranking.
+
+    \- **Reference statistic** from reference contrast used to rank genes. Relative or absolute value of log2(FC) or moderated t-statistic is used to sort genes in a decreasing way.
+
+    \- **FDR p-val threshold** to discard genes with higher FDR p-val than specific threshold [0.05 recommended]. Genes with high FDR p-val are not significant for differential expression in reference contrast.
+
+
+-----
+
+**Outputs**
+
+Depends on GSEA configuration:
+
+- **GSEA Analysis**
+
+    \- **phenotype file (.cls)** to use as "phenotype labels" file for GSEA.
+
+    \- **expression file (.gct)** to use as "expression dataset" file for GSEA.
+
+- **Pre-ranked GSEA Analysis**
+
+    \- **pre-ranked file (.rnk)** to use as "ranked list" file for GSEA pre-ranked.
+
+For all configurations:
+
+\- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report.
+]]> 
+  </help>
+
+ <citations>
+  <citation type="bibtex">@misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870}
+  }</citation>
+
+  <citation type="bibtex">@article {Subramanian15545,
+  author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.},
+  title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles},
+  volume = {102},
+  number = {43},
+  pages = {15545--15550},
+  year = {2005},
+  publisher = {National Academy of Sciences},
+  issn = {0027-8424},
+  journal = {Proceedings of the National Academy of Sciences}
+  }</citation>
+ </citations>
+
+</tool>