Mercurial > repos > vandelj > giant_gsea_format
view galaxy/wrappers/FormatForGSEA.xml @ 1:d72f1bc5ce9e draft
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit e2b27d6ff2eab66454f984dbf1a519192f41db97"
author | vandelj |
---|---|
date | Wed, 09 Sep 2020 10:36:26 +0000 |
parents | 3022feec50fe |
children |
line wrap: on
line source
<!--
  Galaxy tool wrapper that converts GIANT tabular files into the input files
  expected by the GSEA software. Depending on the "GSEA configuration" select:
    * classicGSEA : writes an expression file (gct) and a phenotype file (cls);
    * rankedGSEA  : writes a ranked gene list (rnk).
  A log dataset is written in both configurations.
-->
<tool name="GIANT-GSEA Formatting" id="giant_gsea_format" version="0.2.0">
  <description>Format input files for GSEA software</description>
  <!-- Python helpers providing get_column_names_filteredList used by the dynamic selects below. -->
  <code file="../../src/General_functions.py"/>
  <!--<code file="./src/General_functions.py"/> change for Planemo test-->
  <stdio>
    <regex match="Execution halted" source="both" level="fatal" description="Execution halted." />
    <!-- Exit code 10 is the code raised explicitly by the command below after writing an [ERROR] line to the log. -->
    <exit_code range="10" level="fatal" description="Error during file generation, see log file for more information." />
  </stdio>
  <!--
    Command overview.

    classicGSEA branch
      1. Expression file: the expression dataset is given twice to awk. The first pass counts
         the rows after the header; the second pass prints the "#1.2" line, the row count and
         the number of sample columns, the NAME/DESCRIPTION column titles followed by the sample
         names, then every data row with a constant "na" description.
      2. Phenotype file: the sample order is read from the expression header, the value of the
         selected factor is read for each sample from the factor file, and the output is made of
         three lines: "<samples> <distinct values> 1", "# " followed by the distinct values in
         order of first appearance, and the per-sample values.

    rankedGSEA branch
      1. Checks that (number of header columns minus 2) is a multiple of 5, otherwise logs an
         error and exits with code 10.
      2. Finds the selected contrast in the first row, scanning every 5th column from column 6
         (FC / absFC) or column 7 (Tstat / absTstat). For rows after the second one, keeps the
         rows whose value located 2 (resp. 3) columns before the statistic is below the
         threshold and prints the first column with the statistic (or its absolute value).
      3. Sorts the kept rows on the score in decreasing numeric order below a "#NAME SCORE" header.

    All dataset paths and parameter values are single-quoted so that paths containing
    whitespace or shell metacharacters cannot break or alter the command.

    NOTE(review): ARGIND is a GNU awk extension; this assumes "awk" resolves to gawk on the
    job host. TODO confirm.
    NOTE(review): the shell redirections create the output files even when awk fails, so the
    "-e" existence checks are probably never triggered. TODO confirm before relying on them.
  -->
  <command>
<![CDATA[
#if $mainCondition.selection=="classicGSEA":
    awk 'BEGIN{FS="\t";OFS="";ORS="";nlines=0}
      ARGIND==1 && FNR>1{nlines++}
      ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"}
      ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' '$mainCondition.expressionData' '$mainCondition.expressionData' > '$outExpression';
    if [ ! -e '$outExpression' ]; then printf "[ERROR]Expression file failed" >> '$log'; exit 10; fi ;
    awk -v factor='$mainCondition.factorToInclude' 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0}
      ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1}
      ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i}
      ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd}
      END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' '$mainCondition.expressionData' '$mainCondition.conditionInformation' > '$outPhenotypes';
    if [ ! -e '$outPhenotypes' ]; then printf "[ERROR]Phenotype file failed" >> '$log'; exit 10; fi ;
#else:
    if [ \$(awk 'NR==1{print (NF-2)%5}' '$mainCondition.differentialAnalysis' ) -ne 0 ]; then printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> '$log'; exit 10; fi ;
    awk -v comparison='$mainCondition.comparisonsToUse' -v rkIndice='$mainCondition.rankingIndice' -v pvalTh='$mainCondition.pvalThreshold' 'BEGIN{FS="\t"}
      NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};}
      NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)<pvalTh){if(rkIndice=="Tstat"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)<pvalTh){if(rkIndice=="FC"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' '$mainCondition.differentialAnalysis' > ./temp.txt;
    printf "#NAME\tSCORE\n" > '$outRankedGenes';
    LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> '$outRankedGenes';
    rm ./temp.txt;
    if [ ! -e '$outRankedGenes' ]; then printf "[ERROR]Rank file failed" >> '$log'; exit 10; fi ;
#end if
printf "[INFO]End of tool script" >> '$log';
]]>
  </command>
  <inputs>
    <!-- Prefix used in the labels of every output dataset. -->
    <param type="text" name="title" value="GSEAformat_toPersonalize" label="Title for output (without space)"/>
    <conditional name="mainCondition">
      <param name="selection" type="select" label="GSEA configuration" force_select="true">
        <option value="classicGSEA">GSEA analysis </option>
        <option value="rankedGSEA">Pre-ranked GSEA analysis</option>
      </param>
      <when value="classicGSEA">
        <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" optional="false" multiple="false">
        </param>
        <param type="data" name="conditionInformation" format="tabular" label="Factor information tabular file" optional="false" multiple="false">
        </param>
        <!-- Options are computed from the factor information file by get_column_names_filteredList. -->
        <param name="factorToInclude" type="select" label="Reference factor" multiple="false" optional="false" refresh_on_change="true" dynamic_options="get_column_names_filteredList(mainCondition['conditionInformation'].file_name,[0])">
        </param>
      </when>
      <when value="rankedGSEA">
        <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" multiple="false" help="This file should contain only annotated gene names or only probe identifiers as rows but not both kinds.">
        </param>
        <!-- Options are computed from the differential analysis file by get_column_names_filteredList. -->
        <param name="comparisonsToUse" type="select" label="Reference contrast" optional="false" multiple="false" refresh_on_change="true" dynamic_options="get_column_names_filteredList(mainCondition['differentialAnalysis'].file_name,[0,1],5)">
          <validator type="empty_field" message="You should specify one factor"></validator>
        </param>
        <param type="select" name="rankingIndice" display="radio" label="Reference statistic">
          <option value="FC">Relative value of Log2(Fold Change)</option>
          <option value="absFC">Absolute value of Log2(Fold Change)</option>
          <option value="Tstat">Relative value of moderated t-statistic</option>
          <option value="absTstat">Absolute value of moderated t-statistic</option>
        </param>
        <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold">
          <validator type="in_range" min="0" max="1" exclude_min="true" message="Threshold should be between 0 and 1"/>
        </param>
      </when>
    </conditional>
  </inputs>
  <outputs>
    <!-- Each output is only created for the configuration that produces it; the log is always created. -->
    <data format="gct" name="outExpression" label="${title}_Expressions">
      <filter>mainCondition['selection']=="classicGSEA"</filter>
    </data>
    <data format="cls" name="outPhenotypes" label="${title}_Phenotypes">
      <filter>mainCondition['selection']=="classicGSEA"</filter>
    </data>
    <data format="rnk" name="outRankedGenes" label="${title}_Ranked_Genes">
      <filter>mainCondition['selection']=="rankedGSEA"</filter>
    </data>
    <data format="txt" name="log" label="${title}_Log" />
  </outputs>
  <tests>
    <!-- Test A: classic GSEA configuration. -->
    <test maxseconds="3600" >
      <conditional name="mainCondition">
        <param name="selection" value="classicGSEA" />
        <param name="expressionData" value="./NormalizedData.tabular" />
        <param name="conditionInformation" value="./FactorFileGenerator/output/conditionsFile.csv" />
        <param name="factorToInclude" value="Treatment" />
      </conditional>
      <output name="outExpression" file="./FormatForGSEA/output/outputExpression_A.gct" />
      <output name="outPhenotypes" file="./FormatForGSEA/output/outputPhenotypes_A.cls" />
      <output name="log" file="./FormatForGSEA/output/outputLog_A.txt" />
    </test>
    <!-- Test B: pre-ranked GSEA configuration. -->
    <test maxseconds="3600" >
      <conditional name="mainCondition">
        <param name="selection" value="rankedGSEA" />
        <param name="differentialAnalysis" value="./DiffExprLimma/output/outputStat.csv" />
        <param name="comparisonsToUse" value="TreatVsControl" />
        <param name="rankingIndice" value="absFC" />
        <param name="pvalThreshold" value="0.05" />
      </conditional>
      <output name="outRankedGenes" file="./FormatForGSEA/output/outputRanks_B.rnk" />
      <output name="log" file="./FormatForGSEA/output/outputLog_B.txt" />
    </test>
  </tests>
  <help>
<![CDATA[
**What it does**

Generate input files required for GSEA analysis.

-----

**Parameters**

\- **Title** to personalize output file names (please avoid special characters).

\- **GSEA configuration** : "GSEA analysis" requires expression dataset and phenotype label for each sample whereas "Pre-ranked GSEA Analysis" needs ranked list of genes extracted from differential analysis results.

- **GSEA Analysis**

    \- **Expression tabular file** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers).

    ::

        Conditions  157_(HuGene-2_0-st).CEL  156_(HuGene-2_0-st).CEL  155_(HuGene-2_0-st).CEL  154_(HuGene-2_0-st).CEL
        DDX11L2     4.500872                 4.429759                 4.780281                 4.996189
        MIR1302-2   3.415065                 3.520472                 3.471503                 3.567988
        OR4F5       3.737956                 3.011586                 3.424494                 3.497545
        VWA1        5.189621                 5.129595                 4.806793                 5.227014

    \- **Factor information tabular file** with factors as columns and samples as rows (header row contains factor names and first column sample names).

    ::

        Conditions               Sex  Treatment  Reaction
        138_(HuGene-2_0-st).CEL  1    TreatA     Pos
        148_(HuGene-2_0-st).CEL  0    NoTreat    Pos
        139_(HuGene-2_0-st).CEL  0    TreatB     Neg
        149_(HuGene-2_0-st).CEL  0    NoTreat    Neg

    \- **Reference factor** to use as phenotype in GSEA amongst available factors in factor information file.

- **Pre-ranked GSEA Analysis**

    \- **Differential analysis tabular file** with contrasts statistics (p-val, FDR p-val, FC, log2(FC) and moderated t-statistic) as columns and genes as rows (first and second rows contain contrasts definition and first and second columns contain gene identifiers and functional information). Please respect the GIANT-Differential Expression Analysis with LIMMA tool output format.

    ::

        LIMMA     comparison  WT*Treat  WT*Treat   WT*Treat  WT*Treat  WT*Treat
        Gene      Info        p-val     FDR.p-val  FC        log2(FC)  t-stat
        ARSD      na          0.0057    0.41       0.8389    -0.2534   -5.175
        TTTY10    na          1.6e-07   0.0074     0.6403    -0.6432   -6.122
        MIR548AL  na          0.072     0.2914     1.711     0.775     10.43

    \- **Reference contrast** from available contrasts in differential analysis file to use for gene ranking.

    \- **Reference statistic** from reference contrast used to rank genes. Relative or absolute value of log2(FC) or moderated t-statistic is used to sort genes in a decreasing way.

    \- **FDR p-val threshold** to discard genes with higher FDR p-val than specific threshold [0.05 recommended]. Genes with high FDR p-val are not significant for differential expression in reference contrast.

-----

**Outputs**

Depends on GSEA configuration:

- **GSEA Analysis**

    \- **phenotype file (.cls)** to use as "phenotype labels" file for GSEA.

    \- **expression file (.gct)** to use as "expression dataset" file for GSEA.

- **Pre-ranked GSEA Analysis**

    \- **pre-ranked file (.rnk)** to use as "ranked list" file for GSEA pre-ranked.

For all configurations:

\- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report.
]]>
  </help>
  <citations>
    <citation type="bibtex">@misc{vandel_jimmy_2018_1477870,
      author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.},
      title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data},
      month = nov,
      year = 2018,
      doi = {10.5281/zenodo.1477870},
      url = {https://doi.org/10.5281/zenodo.1477870}
      }</citation>
    <citation type="bibtex">@article {Subramanian15545,
      author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.},
      title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles},
      volume = {102},
      number = {43},
      pages = {15545--15550},
      year = {2005},
      publisher = {National Academy of Sciences},
      issn = {0027-8424},
      journal = {Proceedings of the National Academy of Sciences}
      }</citation>
  </citations>
</tool>