comparison galaxy/wrappers/FormatForGSEA.xml @ 0:3022feec50fe draft

"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author vandelj
date Fri, 26 Jun 2020 09:36:46 -0400
parents
children d72f1bc5ce9e
comparison
equal deleted inserted replaced
-1:000000000000 0:3022feec50fe
1 <tool name="GIANT-GSEA Formatting" id="giant_gsea_format" version="0.2.0">
<description>Format input files for GSEA software</description>
<!-- External Python file; presumably defines get_column_names_filteredList used by the dynamic_options below (verify against the file). -->
<code file="../../src/General_functions.py"/>
<!-- Job failure rules: the text "Execution halted" on stdout or stderr, or exit code 10 raised by the command script. -->
<stdio>
    <regex source="both" match="Execution halted" level="fatal" description="Execution halted." />
    <exit_code range="10" level="fatal" description="Error during file generation, see log file for more information." />
</stdio>
<command><![CDATA[
## Cheetah template rendered into a bash command line that drives gawk (ARGIND is a gawk extension).
## Every statement ends with ';' or '&&', so the script does not rely on line breaks.
## Literal '$' and '#' meant for the shell or awk are backslash-escaped so that Cheetah leaves them alone.
## Every failure path appends an [ERROR] line to the log dataset and exits with code 10 (declared fatal in stdio).
#if $mainCondition.selection=="classicGSEA":

    ## Expression file (.gct): the first pass over the expression table counts the data rows, the second pass
    ## writes the "#1.2" header, the "rows<TAB>samples" line, then every row with an "na" description column.
    awk 'BEGIN{FS="\t";OFS="";ORS="";nlines=0} ARGIND==1 && FNR>1{nlines++} ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"} ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' '$mainCondition.expressionData' '$mainCondition.expressionData' > '$outExpression';

    ## The redirection always creates the output file, so an existence test alone can never fail:
    ## check the awk exit status and that something was actually written.
    if [ \$? -ne 0 ] || [ ! -s '$outExpression' ]; then
        printf "[ERROR]Expression file failed" >> '$log';
        exit 10;
    fi;

    ## Phenotype file (.cls): sample order comes from the expression header, class labels from the selected
    ## factor column. awk exits with status 1 when the factor name is absent from the header instead of
    ## silently using whole lines as labels.
    awk -v factor="$mainCondition.factorToInclude" 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0} ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1} ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i;if(factorInd=="")exit 1} ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd} END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' '$mainCondition.expressionData' '$mainCondition.conditionInformation' > '$outPhenotypes';

    if [ \$? -ne 0 ] || [ ! -s '$outPhenotypes' ]; then
        printf "[ERROR]Phenotype file failed" >> '$log';
        exit 10;
    fi;

#else:

    ## Pre-ranked GSEA: after the two leading columns, the header must hold a multiple of 5 statistic columns.
    if [ \$(awk 'NR==1{print (NF-2)%5}' '$mainCondition.differentialAnalysis' ) -ne 0 ]; then
        printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> '$log';
        exit 10;
    fi;

    ## Locate the column of the requested contrast (t-statistic column for Tstat/absTstat, log2(FC) column
    ## otherwise), keep the rows whose FDR p-value is below the threshold and emit "gene<TAB>score".
    ## awk exits with status 1 when the contrast is not found, which previously surfaced only as an
    ## unnoticed awk error and a header-only ranking. The '&&' chain stops at the first failing stage.
    awk -v comparison="$mainCondition.comparisonsToUse" -v rkIndice="$mainCondition.rankingIndice" -v pvalTh="$mainCondition.pvalThreshold" 'BEGIN{FS="\t"} NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};if(refCol=="")exit 1} NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)<pvalTh){if(rkIndice=="Tstat"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)<pvalTh){if(rkIndice=="FC"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' '$mainCondition.differentialAnalysis' > ./temp.txt
        && printf "#NAME\tSCORE\n" > '$outRankedGenes'
        && LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> '$outRankedGenes';
    rankStatus=\$?;
    rm -f ./temp.txt;

    ## A ranking holding only its header is legitimate (no gene under the threshold), so only the
    ## pipeline status and the existence of the file are checked here.
    if [ \$rankStatus -ne 0 ] || [ ! -e '$outRankedGenes' ]; then
        printf "[ERROR]Rank file failed" >> '$log';
        exit 10;
    fi;

#end if

    printf "[INFO]End of tool script" >> '$log';
]]>
</command>
<inputs>
    <!-- Free-text prefix used in the label of every output dataset. -->
    <param type="text" name="title" value="GSEAformat_toPersonalize" label="Title for output (without space)"/>
    <!-- Switch between the classic workflow (expression + phenotype files) and the pre-ranked workflow (rank file). -->
    <conditional name="mainCondition">
        <param name="selection" type="select" label="GSEA configuration" force_select="true">
            <option value="classicGSEA">GSEA analysis </option>
            <option value="rankedGSEA">Pre-ranked GSEA analysis</option>
        </param>
        <when value="classicGSEA">
            <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" optional="false" multiple="false">
            </param>
            <param type="data" name="conditionInformation" format="tabular" label="Factor information tabular file" optional="false" multiple="false">
            </param>
            <!-- Choices are computed by get_column_names_filteredList from the factor information file. -->
            <param name="factorToInclude" type="select" label="Reference factor" multiple="false" optional="false" refresh_on_change="true"
                   dynamic_options="get_column_names_filteredList(mainCondition['conditionInformation'].file_name,[0])">
            </param>
        </when>
        <when value="rankedGSEA">
            <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" multiple="false" help="This file should contain only annotated gene names or only probe identifiers as rows but not both kinds.">
            </param>
            <!-- Choices are computed by get_column_names_filteredList from the differential analysis file. -->
            <param name="comparisonsToUse" type="select" label="Reference contrast" optional="false" multiple="false" refresh_on_change="true" dynamic_options="get_column_names_filteredList(mainCondition['differentialAnalysis'].file_name,[0,1],5)">
                <validator type="empty_field" message="You should specify one contrast"></validator>
            </param>
            <!-- Statistic written as the score of each gene; the abs* variants use its absolute value. -->
            <param type="select" name="rankingIndice" display="radio" label="Reference statistic">
                <option value="FC">Relative value of Log2(Fold Change)</option>
                <option value="absFC">Absolute value of Log2(Fold Change)</option>
                <option value="Tstat">Relative value of moderated t-statistic</option>
                <option value="absTstat">Absolute value of moderated t-statistic</option>
            </param>
            <!-- Only genes whose FDR p-value is strictly below this threshold are written to the ranking. -->
            <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold">
                <validator type="in_range" min="0" max="1" exclude_min="true" message="Threshold should be between 0 and 1"/>
            </param>
        </when>
    </conditional>
</inputs>
85
<outputs>
    <!-- Classic GSEA only: expression dataset and phenotype labels. -->
    <data name="outExpression" format="gct" label="${title}_Expressions">
        <filter>mainCondition['selection']=="classicGSEA"</filter>
    </data>
    <data name="outPhenotypes" format="cls" label="${title}_Phenotypes">
        <filter>mainCondition['selection']=="classicGSEA"</filter>
    </data>

    <!-- Pre-ranked GSEA only: ranked gene list. -->
    <data name="outRankedGenes" format="rnk" label="${title}_Ranked_Genes">
        <filter>mainCondition['selection']=="rankedGSEA"</filter>
    </data>

    <!-- Produced in every configuration: execution log written by the command script. -->
    <data name="log" format="txt" label="${title}_Log" />
</outputs>
101
<tests>
    <!-- Classic GSEA, phenotype taken from the "Treatment" factor. -->
    <test maxseconds="3600">
        <param name="selection" value="classicGSEA" />
        <param name="expressionData" value="./NormalizedData.tabular" />
        <param name="conditionInformation" value="./conditionGroups.txt" />
        <param name="factorToInclude" value="Treatment" />
        <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" />
        <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesTreatment.cls" />
        <output name="log" file="./GSEA-Formatting/outputRanks.log" />
    </test>
    <!-- Classic GSEA, phenotype taken from the "Type" factor. -->
    <test maxseconds="3600">
        <param name="selection" value="classicGSEA" />
        <param name="expressionData" value="./NormalizedData.tabular" />
        <param name="conditionInformation" value="./conditionGroups.txt" />
        <param name="factorToInclude" value="Type" />
        <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" />
        <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesType.cls" />
        <output name="log" file="./GSEA-Formatting/outputRanks.log" />
    </test>
    <!-- Pre-ranked GSEA, genes ranked on the relative log2(FC) of one contrast. -->
    <test maxseconds="3600">
        <param name="selection" value="rankedGSEA" />
        <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" />
        <param name="comparisonsToUse" value="WT*Control-KO*Control" />
        <param name="rankingIndice" value="FC" />
        <param name="pvalThreshold" value="0.05" />
        <output name="outRankedGenes" file="./GSEA-Formatting/outputRanks.rnk" />
        <output name="log" file="./GSEA-Formatting/outputRanks.log" />
    </test>
</tests>
131 <help>
132 <![CDATA[
133 **What it does**
134
135 Generate input files required for GSEA analysis.
136
137 -----
138
139 **Parameters**
140
141 \- **Title** to personalize output file names (please avoid special characters).
142
143 \- **GSEA configuration** : "GSEA analysis" requires an expression dataset and a phenotype label for each sample, whereas "Pre-ranked GSEA Analysis" needs a ranked list of genes extracted from differential analysis results.
144
145 - **GSEA Analysis**
146
147 \- **Expression tabular file** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers).
148
149 ::
150
151 Conditions 157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL 155_(HuGene-2_0-st).CEL 154_(HuGene-2_0-st).CEL
152 DDX11L2 4.500872 4.429759 4.780281 4.996189
153 MIR1302-2 3.415065 3.520472 3.471503 3.567988
154 OR4F5 3.737956 3.011586 3.424494 3.497545
155 VWA1 5.189621 5.129595 4.806793 5.227014
156
157 \- **Factor information tabular file** with factors as columns and samples as rows (header row contains factor names and first column sample names).
158
159 ::
160
161 Conditions Sex Treatment Reaction
162 138_(HuGene-2_0-st).CEL 1 TreatA Pos
163 148_(HuGene-2_0-st).CEL 0 NoTreat Pos
164 139_(HuGene-2_0-st).CEL 0 TreatB Neg
165 149_(HuGene-2_0-st).CEL 0 NoTreat Neg
166
167 \- **Reference factor** to use as phenotype in GSEA amongst available factors in factor information file.
168
169 - **Pre-ranked GSEA Analysis**
170
171 \- **Differential analysis tabular file** with contrast statistics (p-val, FDR p-val, FC, log2(FC) and moderated t-statistic) as columns and genes as rows (first and second rows contain the contrast definitions and first and second columns contain gene identifiers and functional information). Please respect the GIANT-Differential Expression Analysis with LIMMA tool output format.
172
173 ::
174
175 LIMMA comparison WT*Treat WT*Treat WT*Treat WT*Treat WT*Treat
176 Gene Info p-val FDR.p-val FC log2(FC) t-stat
177 ARSD na 0.0057 0.41 0.8389 -0.2534 -5.175
178 TTTY10 na 1.6e-07 0.0074 0.6403 -0.6432 -6.122
179 MIR548AL na 0.072 0.2914 1.711 0.775 10.43
180
181
182 \- **Reference contrast** from available contrasts in differential analysis file to use for gene ranking.
183
184 \- **Reference statistic** from reference contrast used to rank genes. Relative or absolute value of log2(FC) or moderated t-statistic is used to sort genes in decreasing order.
185
186 \- **FDR p-val threshold** to discard genes whose FDR p-val is not below the specified threshold [0.05 recommended]. Genes with high FDR p-val are not significant for differential expression in reference contrast.
187
188
189 -----
190
191 **Outputs**
192
193 Depends on GSEA configuration:
194
195 - **GSEA Analysis**
196
197 \- **phenotype file (.cls)** to use as "phenotype labels" file for GSEA.
198
199 \- **expression file (.gct)** to use as "expression dataset" file for GSEA.
200
201 - **Pre-ranked GSEA Analysis**
202
203 \- **pre-ranked file (.rnk)** to use as "ranked list" file for GSEA pre-ranked.
204
205 For all configurations:
206
207 \- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report.
208 ]]>
209 </help>
210
<citations>
    <!-- GIANT tool suite (Zenodo record). -->
    <citation type="bibtex">@misc{vandel_jimmy_2018_1477870,
        author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.},
        title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data},
        month = nov,
        year = 2018,
        doi = {10.5281/zenodo.1477870},
        url = {https://doi.org/10.5281/zenodo.1477870}
        }</citation>

    <!-- GSEA method paper. -->
    <citation type="bibtex">@article {Subramanian15545,
        author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.},
        title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles},
        volume = {102},
        number = {43},
        pages = {15545--15550},
        year = {2005},
        publisher = {National Academy of Sciences},
        issn = {0027-8424},
        journal = {Proceedings of the National Academy of Sciences}
        }</citation>
</citations>
227
228 </tool>