Mercurial > repos > vandelj > giant_gsea_format
comparison galaxy/wrappers/FormatForGSEA.xml @ 0:3022feec50fe draft
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author | vandelj |
---|---|
date | Fri, 26 Jun 2020 09:36:46 -0400 |
parents | |
children | d72f1bc5ce9e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3022feec50fe |
---|---|
1 <tool name="GIANT-GSEA Formatting" id="giant_gsea_format" version="0.2.0"> | |
2 <description>Format input files for GSEA software</description> | |
3 <code file="../../src/General_functions.py"/> | |
<!-- Failure detection: the text "Execution halted" on stdout or stderr (source="both"), or exit code 10, is treated as fatal. The command section exits with code 10 after appending an [ERROR] line to the log output. -->
4 <stdio> | |
5 <regex match="Execution halted" | |
6 source="both" | |
7 level="fatal" | |
8 description="Execution halted." /> | |
9 <exit_code range="10" level="fatal" description="Error during file generation, see log file for more information." /> | |
10 </stdio> | |
<!--
Command template (Cheetah + shell + awk). The branch is chosen by mainCondition.selection.

classicGSEA branch:
  1. Expression file: awk is given the expression table twice and uses ARGIND (a gawk extension) to tell the passes apart.
     Pass 1 counts the rows after the header. Pass 2 prints "#1.2", then the row count and NF-1 (number of columns minus the
     identifier column), then a NAME/DESCRIPTION header followed by the original column names, then every data row with
     "na" inserted as the description field.
  2. Phenotype file: awk reads the column order from the expression header, locates the column of the condition table whose
     header equals the selected factor, maps each first-column value of the condition table to its factor value, and in END
     prints three lines: "<sample count> <distinct value count> 1", "# <distinct values in order of first appearance>",
     and the per-sample factor values in expression column order.

Other branch (pre-ranked):
  1. Checks on the first row that (NF-2) is a multiple of 5, otherwise logs an error and exits 10.
  2. awk scans the first row for the column equal to the chosen comparison, starting at column 7 for Tstat/absTstat and at
     column 6 otherwise, in steps of 5. From the third row on it keeps rows whose value in column refCol-3 (Tstat modes) or
     refCol-2 (FC modes) is below pvalTh, and prints the first column plus the statistic (sign removed for the abs modes).
  3. The result is sorted on column 2 with general numeric, reverse ordering under LC_ALL=C and appended after a
     "#NAME<TAB>SCORE" header line; the temporary file is then removed.

Both branches end by appending "[INFO]End of tool script" to the log.

NOTE(review): each "[ ! -e ... ]" test runs after a ">" redirection to the same path, which creates the file even when awk
prints nothing, so these checks presumably never trigger. Confirm, and consider testing for an empty file instead.
NOTE(review): the rows holding only ";" presumably rely on the command lines being joined into a single shell line before
execution. TODO confirm against the Galaxy profile in use.
-->
11 <command> <![CDATA[ | |
12 #if $mainCondition.selection=="classicGSEA": | |
13 | |
14 awk 'BEGIN{FS="\t";OFS="";ORS="";nlines=0} ARGIND==1 && FNR>1{nlines++} ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"} ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' $mainCondition.expressionData $mainCondition.expressionData > $outExpression; | |
15 | |
16 if [ ! -e $outExpression ]; then | |
17 printf "[ERROR]Expression file failed" >> $log; | |
18 exit 10; | |
19 fi | |
20 ; | |
21 awk -v factor="$mainCondition.factorToInclude" 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0} ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1} ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i} ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd} END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' $mainCondition.expressionData $mainCondition.conditionInformation > $outPhenotypes; | |
22 | |
23 if [ ! -e $outPhenotypes ]; then | |
24 printf "[ERROR]Phenotype file failed" >> $log; | |
25 exit 10; | |
26 fi | |
27 ; | |
28 #else: | |
29 | |
30 if [ \$(awk 'NR==1{print (NF-2)%5}' $mainCondition.differentialAnalysis ) -ne 0 ]; then | |
31 printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> $log; | |
32 exit 10; | |
33 fi | |
34 ; | |
35 | |
36 awk -v comparison="$mainCondition.comparisonsToUse" -v rkIndice="$mainCondition.rankingIndice" -v pvalTh="$mainCondition.pvalThreshold" 'BEGIN{FS="\t"} NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};} NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)<pvalTh){if(rkIndice=="Tstat"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)<pvalTh){if(rkIndice=="FC"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' $mainCondition.differentialAnalysis > ./temp.txt; | |
37 printf "#NAME\tSCORE\n" > $outRankedGenes; | |
38 LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> $outRankedGenes; | |
39 rm ./temp.txt; | |
40 | |
41 if [ ! -e $outRankedGenes ]; then | |
42 printf "[ERROR]Rank file failed" >> $log; | |
43 exit 10; | |
44 fi | |
45 ; | |
46 #end if | |
47 | |
48 printf "[INFO]End of tool script" >> $log; | |
49 ]]> | |
50 </command> | |
<!-- Tool inputs: an output title plus a conditional that switches between the classic GSEA inputs (expression table, factor table, reference factor) and the pre-ranked inputs (differential analysis table, reference contrast, ranking statistic, FDR threshold). The two select lists with dynamic_options are filled by get_column_names_filteredList, which is not defined in this file (see the code file declared at the top of the tool). -->
51 <inputs> | |
52 <param type="text" name="title" value="GSEAformat_toPersonalize" label="Title for output (without space)"/> | |
53 <conditional name="mainCondition"> | |
54 <param name="selection" type="select" label="GSEA configuration" force_select="true"> | |
55 <option value="classicGSEA">GSEA analysis </option> | |
56 <option value="rankedGSEA">Pre-ranked GSEA analysis</option> | |
57 </param> | |
58 <when value="classicGSEA"> | |
59 <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" optional="false" multiple="false"> | |
60 </param> | |
61 <param type="data" name="conditionInformation" format="tabular" label="Factor information tabular file" optional="false" multiple="false"> | |
62 </param> | |
63 <param name="factorToInclude" type="select" label="Reference factor" multiple="false" optional="false" refresh_on_change="true" | |
64 dynamic_options="get_column_names_filteredList(mainCondition['conditionInformation'].file_name,[0])"> | |
65 </param> | |
66 </when> | |
67 <when value="rankedGSEA"> | |
68 <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" multiple="false" help="This file should contain only annotated gene names or only probe identifiers as rows but not both kinds."> | |
69 </param> | |
70 <param name="comparisonsToUse" type="select" label="Reference contrast" optional="false" multiple="false" refresh_on_change="true" dynamic_options="get_column_names_filteredList(mainCondition['differentialAnalysis'].file_name,[0,1],5)"> | |
71 <validator type="empty_field" message="You should specify one contrast"></validator> | |
72 </param> | |
73 <param type="select" name="rankingIndice" display="radio" label="Reference statistic"> | |
74 <option value="FC">Relative value of Log2(Fold Change)</option> | |
75 <option value="absFC">Absolute value of Log2(Fold Change)</option> | |
76 <option value="Tstat">Relative value of moderated t-statistic</option> | |
77 <option value="absTstat">Absolute value of moderated t-statistic</option> | |
78 </param> | |
79 <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold"> | |
80 <validator type="in_range" min="0" max="1" exclude_min="true" message="Threshold should be between 0 and 1"/> | |
81 </param> | |
82 </when> | |
83 </conditional> | |
84 </inputs> | |
85 | |
<!-- Tool outputs: the gct and cls datasets are produced only when mainCondition.selection is "classicGSEA", the rnk dataset only when it is "rankedGSEA"; the txt log has no filter and is produced in both modes. Every label is prefixed with the user-supplied title. -->
86 <outputs> | |
87 <data format="gct" name="outExpression" label="${title}_Expressions"> | |
88 <filter>mainCondition['selection']=="classicGSEA"</filter> | |
89 </data> | |
90 | |
91 <data format="cls" name="outPhenotypes" label="${title}_Phenotypes"> | |
92 <filter>mainCondition['selection']=="classicGSEA"</filter> | |
93 </data> | |
94 | |
95 <data format="rnk" name="outRankedGenes" label="${title}_Ranked_Genes"> | |
96 <filter>mainCondition['selection']=="rankedGSEA"</filter> | |
97 </data> | |
98 | |
99 <data format="txt" name="log" label="${title}_Log" /> | |
100 </outputs> | |
101 | |
<!-- Three functional tests: classic GSEA with factor "Treatment", classic GSEA with factor "Type" (same expected gct, different expected cls), and pre-ranked GSEA on contrast "WT*Control-KO*Control" ranked by FC with a 0.05 threshold. All three compare the log against the same outputRanks.log reference file; on success the command only appends the final [INFO] line to the log in either mode. -->
102 <tests> | |
103 <test maxseconds="3600" > | |
104 <param name="selection" value="classicGSEA" /> | |
105 <param name="expressionData" value="./NormalizedData.tabular" /> | |
106 <param name="conditionInformation" value="./conditionGroups.txt" /> | |
107 <param name="factorToInclude" value="Treatment" /> | |
108 <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" /> | |
109 <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesTreatment.cls" /> | |
110 <output name="log" file="./GSEA-Formatting/outputRanks.log" /> | |
111 </test> | |
112 <test maxseconds="3600" > | |
113 <param name="selection" value="classicGSEA" /> | |
114 <param name="expressionData" value="./NormalizedData.tabular" /> | |
115 <param name="conditionInformation" value="./conditionGroups.txt" /> | |
116 <param name="factorToInclude" value="Type" /> | |
117 <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" /> | |
118 <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesType.cls" /> | |
119 <output name="log" file="./GSEA-Formatting/outputRanks.log" /> | |
120 </test> | |
121 <test maxseconds="3600" > | |
122 <param name="selection" value="rankedGSEA" /> | |
123 <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" /> | |
124 <param name="comparisonsToUse" value="WT*Control-KO*Control" /> | |
125 <param name="rankingIndice" value="FC" /> | |
126 <param name="pvalThreshold" value="0.05" /> | |
127 <output name="outRankedGenes" file="./GSEA-Formatting/outputRanks.rnk" /> | |
128 <output name="log" file="./GSEA-Formatting/outputRanks.log" /> | |
129 </test> | |
130 </tests> | |
131 <help> | |
132 <![CDATA[ | |
133 **What it does** | |
134 | |
135 Generate input files required for GSEA analysis. | |
136 | |
137 ----- | |
138 | |
139 **Parameters** | |
140 | |
141 \- **Title** to personalize output file names (please avoid special characters). | |
142 | |
143 \- **GSEA configuration** : "GSEA analysis" requires an expression dataset and a phenotype label for each sample, whereas "Pre-ranked GSEA Analysis" needs a ranked list of genes extracted from differential analysis results. | |
144 | |
145 - **GSEA Analysis** | |
146 | |
147 \- **Expression tabular file** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers). | |
148 | |
149 :: | |
150 | |
151 Conditions 157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL 155_(HuGene-2_0-st).CEL 154_(HuGene-2_0-st).CEL | |
152 DDX11L2 4.500872 4.429759 4.780281 4.996189 | |
153 MIR1302-2 3.415065 3.520472 3.471503 3.567988 | |
154 OR4F5 3.737956 3.011586 3.424494 3.497545 | |
155 VWA1 5.189621 5.129595 4.806793 5.227014 | |
156 | |
157 \- **Factor information tabular file** with factors as columns and samples as rows (header row contains factor names and first column sample names). | |
158 | |
159 :: | |
160 | |
161 Conditions Sex Treatment Reaction | |
162 138_(HuGene-2_0-st).CEL 1 TreatA Pos | |
163 148_(HuGene-2_0-st).CEL 0 NoTreat Pos | |
164 139_(HuGene-2_0-st).CEL 0 TreatB Neg | |
165 149_(HuGene-2_0-st).CEL 0 NoTreat Neg | |
166 | |
167 \- **Reference factor** to use as phenotype in GSEA amongst available factors in factor information file. | |
168 | |
169 - **Pre-ranked GSEA Analysis** | |
170 | |
171 \- **Differential analysis tabular file** with contrast statistics (p-val, FDR p-val, FC, log2(FC) and moderated t-statistic) as columns and genes as rows (first and second rows contain contrast definitions and first and second columns contain gene identifiers and functional information). Please respect the GIANT-Differential Expression Analysis with LIMMA tool output format. | |
172 | |
173 :: | |
174 | |
175 LIMMA comparison WT*Treat WT*Treat WT*Treat WT*Treat WT*Treat | |
176 Gene Info p-val FDR.p-val FC log2(FC) t-stat | |
177 ARSD na 0.0057 0.41 0.8389 -0.2534 -5.175 | |
178 TTTY10 na 1.6e-07 0.0074 0.6403 -0.6432 -6.122 | |
179 MIR548AL na 0.072 0.2914 1.711 0.775 10.43 | |
180 | |
181 | |
182 \- **Reference contrast** from available contrasts in differential analysis file to use for gene ranking. | |
183 | |
184 \- **Reference statistic** from reference contrast used to rank genes. The relative or absolute value of log2(FC) or of the moderated t-statistic is used to sort genes in decreasing order. | |
185 | |
186 \- **FDR p-val threshold** to discard genes whose FDR p-val is higher than the specified threshold [0.05 recommended]. Genes with a high FDR p-val are not significantly differentially expressed in the reference contrast. | |
187 | |
188 | |
189 ----- | |
190 | |
191 **Outputs** | |
192 | |
193 Depends on GSEA configuration: | |
194 | |
195 - **GSEA Analysis** | |
196 | |
197 \- **phenotype file (.cls)** to use as "phenotype labels" file for GSEA. | |
198 | |
199 \- **expression file (.gct)** to use as "expression dataset" file for GSEA. | |
200 | |
201 - **Pre-ranked GSEA Analysis** | |
202 | |
203 \- **pre-ranked file (.rnk)** to use as "ranked list" file for GSEA pre-ranked. | |
204 | |
205 For all configurations: | |
206 | |
207 \- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report. | |
208 ]]> | |
209 </help> | |
210 | |
<!-- Two BibTeX citations: the GIANT tool suite record (DOI 10.5281/zenodo.1477870) and the 2005 gene set enrichment analysis article by Subramanian et al. -->
211 <citations> | |
212 <citation type="bibtex">@misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870} | |
213 }</citation> | |
214 | |
215 <citation type="bibtex">@article {Subramanian15545, | |
216 author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.}, | |
217 title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles}, | |
218 volume = {102}, | |
219 number = {43}, | |
220 pages = {15545--15550}, | |
221 year = {2005}, | |
222 publisher = {National Academy of Sciences}, | |
223 issn = {0027-8424}, | |
224 journal = {Proceedings of the National Academy of Sciences} | |
225 }</citation> | |
226 </citations> | |
227 | |
228 </tool> |