Mercurial > repos > xiaowei > gsva

--- a/GSVA.R	Thu Apr 01 10:19:31 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,150 +0,0 @@
-#=================================================================
-# GSVA.R - command-line wrapper around GSVA::gsva().
-# Reads a gene-by-sample expression CSV and an RData file that defines an
-# object named geneSet, then writes the enrichment scores as CSV and a
-# row-scaled heatmap of those scores as a PNG, JPEG or PDF file.
-# how to pass parameters: each spec row is long flag, short flag,
-# argument mask (1 = value required), value type, description.
-#=================================================================
-spec <- matrix(c("expr", 'E', 1, 'character', 'Gene expression data which is an CSV file of expression values where rows correspond to genes and columns correspond to samples.',
-                 "geneSet", 'G', 1, 'character', 'Gene set',
-                 'gene_Identifier_class','C', 1, 'character', 'Gene Identifier class of GeneSet',
-                 'method', "M", 1,'character', 'One of gsva, ssgsea, zscore, plage',
-                 'img_file', 'I', 1,'character', 'img_file',
-                 'img_type', 'T', 1,'character', 'PDF, PNG, JPEG',
-                 'img_width', 'W', 1, 'integer', 'the img file width',
-                 'img_height', 'H', 1, 'integer', 'the img file height',
-                 'GSVA_result', 'R', 1, 'character', 'Result of GSVA, an CSV file.'
-
-                 ),
-               byrow = TRUE, ncol = 5)
-
-if (!requireNamespace("getopt", quietly = TRUE)) {
-  # A non-interactive Rscript session cannot prompt for a CRAN mirror.
-  install.packages("getopt", repos = "https://cloud.r-project.org")
-}
-
-opt <- getopt::getopt(spec)
-
-#----------------
-# Validate arguments and apply defaults
-#----------------
-for (required in c("expr", "geneSet", "img_file", "GSVA_result")) {
-  if (is.null(opt[[required]])) {
-    stop("Missing required argument --", required, call. = FALSE)
-  }
-}
-if (is.null(opt$method)) opt$method <- "gsva"
-opt$method <- match.arg(opt$method, c("gsva", "ssgsea", "zscore", "plage"))
-
-# Accept any letter case, and both spellings of the JPEG type.
-img_type <- if (is.null(opt$img_type)) "PNG" else toupper(opt$img_type)
-if (img_type == "JPEG") img_type <- "JPG"
-if (!img_type %in% c("PNG", "JPG", "PDF")) {
-  stop("Unsupported --img_type: ", opt$img_type, call. = FALSE)
-}
-
-# png()/jpeg() are sized in pixels, pdf() in inches, so the defaults differ.
-default_size <- if (img_type == "PDF") 7 else 900
-img_width <- if (is.null(opt$img_width)) default_size else opt$img_width
-img_height <- if (is.null(opt$img_height)) default_size else opt$img_height
-
-#================================================================
-#run codes
-#================================================================
-gsva_input_data <- read.csv(opt$expr, row.names = 1)
-
-# Reading a GMT file through GSEABase::getGmt() is disabled, which is why
-# --gene_Identifier_class is accepted but has no effect.
-# Load into a private environment so that objects stored in the file cannot
-# overwrite this script's own variables (for example opt).
-# NOTE(review): load() restores arbitrary R objects - use trusted files only.
-geneset_env <- new.env()
-load(opt$geneSet, envir = geneset_env)
-if (!exists("geneSet", envir = geneset_env, inherits = FALSE)) {
-  stop("The gene set file must define an object named 'geneSet'", call. = FALSE)
-}
-geneSet <- get("geneSet", envir = geneset_env, inherits = FALSE)
-
-result <- GSVA::gsva(as.matrix(gsva_input_data), geneSet, mx.diff = FALSE,
-                     verbose = FALSE, parallel.sz = 2, method = opt$method)
-
-#================================================================
-#output
-#================================================================
-write.csv(result, file = opt$GSVA_result)
-
-if (img_type == "PNG") {
-  png(filename = opt$img_file, width = img_width, height = img_height)
-} else if (img_type == "JPG") {
-  jpeg(filename = opt$img_file, width = img_width, height = img_height)
-} else {
-  pdf(file = opt$img_file, width = img_width, height = img_height)
-}
-
-pheatmap::pheatmap(result, scale = "row", main = "heatmap", show_colnames = TRUE)
-dev.off()
-
-#Reference:
-#https://nbviewer.jupyter.org/github/mora-lab/benchmarks/blob/master/single-sample/workflows/Tutorial%20of%20GSVA%20using%20data%20GSE10245.ipynb
-
-#<requirement type="package" version="1.0.12">r-pheatmap</requirement>
-
-# library(GSVA)
-#
-# p <- 20000    ## number of genes
-# n <- 30       ## number of samples
-# nGS <- 100    ## number of gene sets
-# min.sz <- 10  ## minimum gene set size
-# max.sz <- 100 ## maximum gene set size
-# X <- matrix(rnorm(p*n), nrow=p, dimnames=list(1:p, 1:n))
-# dim(X)
-# gs <- as.list(sample(min.sz:max.sz, size=nGS, replace=TRUE)) ## sample gene set sizes
-# gs <- lapply(gs, function(n, p) sample(1:p, size=n, replace=FALSE), p) ## sample gene sets
-# es.max <- gsva(X, gs, mx.diff=FALSE, verbose=FALSE, parallel.sz=1)
-# es.dif <- gsva(X, gs, mx.diff=TRUE, verbose=FALSE, parallel.sz=1)
-
-#The boss said we only need to output the matrix (rows = pathways, columns = samples) and its heatmap
-
-#
-# gsva(expr,  #基因表达数据，可以是SummarizedExperiment或ExpressionSet对象，也可以是行为基因列为样本的矩阵
-#      gset.idx.list,  #基因集，list或GeneSetCollection 对象
-#      annotation, #基因表达数据为SummarizedExperiment对象时，用来挑选矩阵中的分子数据； 当基因表达数据为matrix，基因集为GeneSetCollection对象时，annotation参数用来注释矩阵的注释信息。 当表达数据是ExpressionSet时，该参数忽略。
-#      method=c("gsva", "ssgsea", "zscore", "plage"), #用于估计每个样本的基因集富集分数的方法
-#      kcdf=c("Gaussian", "Poisson", "none"), #表示样本间表达式水平累积分布函数非参数估计时使用的内核字符串。当基因表达值是连续时(对数)，使用"Gaussian", 当基因表达值是整数计数时(非对数)，使用"Poisson"
-#      abs.ranking=FALSE, #当mx.diff=TRUE时才会用到该参数。 abs.ranking=FALSE(默认值)，使用修正的Kuiper统计量来计算富集分数，取最大的正随机游走偏差和负随机游走偏差之间的量级差。 当abs.ranking=TRUE时，使用最大的正的和负的随机游走偏差之和的原始Kuiper统计量。
-#      min.sz=1, #gene set的最小基因数
-#      max.sz=Inf, #gene set的最大基因数
-#      parallel.sz=1L, #并行计算时要使用的执行线程数。 参数BPPARAM允许设置并行后端并优化其配置。
-#      mx.diff=TRUE, #提出从KS随机游走统计量计算富集统计量的两种方法。 mx.diff=FALSE, 计算为随机游走到0的最大距离。 TRUE（默认）时，计算最大的正的和负的随机游走偏差之间的幅度差
-#      tau=switch(method, gsva=1, ssgsea=0.25, NA), #定义由gsva和ssgsea方法执行的随机游走中尾部权重的指数。
-#      ssgsea.norm=TRUE, #当为TRUE时，运行ssgsea时，通过最小值和最大值之间的绝对差对分数进行标准化。否则忽略这步标准化
-#      verbose=TRUE, 是否打印输出每一步计算，默认是false
-#      BPPARAM=SerialParam(progressbar=verbose))  #BiocParallelParam类的一个对象，指定与此函数内的一些任务和计算的并行执行相关的参数。
-
-
-###########################
-#Input
-###########################
-#expression data
-#gene set, e.g. the KEGG ones
-#method: one of "gsva", "ssgsea", "zscore", "plage"
-
-######################################################
-#Outputs requested by the boss -- matrix data, heatmap, and pathway differential expression done with limma
-######################################################
-
-
-#The first column must hold the gene names; samples start from the second column
-# gsva_input_data <- read.csv("GSVA_input.csv", row.names = 1)
-#
-# geneset <- GSEABase::getGmt("example_pathway.gmt",
-#                             geneIdType = SymbolIdentifier())
-#
-# result <- gsva(as.matrix(gsva_input_data), geneset, mx.diff=FALSE,
-#                verbose=FALSE, parallel.sz=2, method = "plage")
-#
-# write.csv(result, file = "GSVA_result.csv")
-
-
-
--- a/GSVA.xml	Thu Apr 01 10:19:31 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,155 +0,0 @@
-<tool id="GSVA" name="Gene Set Variation Analysis" version="0.1.0">
-    <description>Estimates GSVA enrichment scores</description>
-    <requirements>
-        <requirement type="package" version="1.38.0">bioconductor-GSVA</requirement>
-        <requirement type="package" version="1.0.12">r-pheatmap</requirement>
-        <requirement type="package" version="1.20.3">r-getopt</requirement>
-    </requirements>
-    <command detect_errors="exit_code"><![CDATA[
-        Rscript '$__tool_directory__/GSVA.R'
-        --expr '$expression_data'
-        --geneSet '$geneSet'
-        --method '$method'
-        --img_type '$imgfile.img_args.img_type'
-        --img_width '$imgfile.img_args.img_width'
-        --img_height '$imgfile.img_args.img_height'
-        --img_file '$output_img_file'
-        --GSVA_result '$GSVA_result'
-    ]]></command>
-    <inputs>
-        <param name="expression_data" type="data" format="csv" label="Gene expression data" help="A CSV file containing a matrix of expression values where rows correspond to genes and columns correspond to samples." />
-
-        <param name="geneSet" type="data" format="rdata" label="Gene Sets" help="An RData file containing a GeneSetCollection object named 'geneSet'."/>
-
-        <param name="method" type="select" label="Method" display="radio" help="Four methods are supported: GSVA, ssGSEA, z-score and PLAGE. See the help section for details.">
-            <option value="gsva">GSVA</option>
-            <option value="ssgsea">ssGSEA</option>
-            <option value="zscore">z-score</option>
-            <option value="plage">PLAGE</option>
-        </param>
-
-        <section name="imgfile" title="Heatmap" expanded="false">
-            <conditional name="img_args">
-                <param name="img_type" type="select" label="Heatmap file type">
-                    <option value="PNG" selected="true">PNG</option>
-                    <option value="PDF">PDF</option>
-                    <option value="JPG">JPG</option>
-                </param>
-
-                <when value="PNG">
-                    <param name="img_width" type="integer" value="480" min="480" label="Img width(px)" />
-                    <param name="img_height" type="integer" value="480" min="480" label="Img height(px)" />
-                </when>
-
-                <when value="JPG">
-                    <param name="img_width" type="integer" value="480" min="480" label="Img width(px)" />
-                    <param name="img_height" type="integer" value="480" min="480" label="Img height(px)" />
-                </when>
-
-                <when value="PDF">
-                    <param name="img_width" type="integer" value="7" min="7" label="Img width(inches)" />
-                    <param name="img_height" type="integer" value="7" min="7" label="Img height(inches)" />
-                </when>
-
-            </conditional>
-        </section>
-
-    </inputs>
-    <outputs>
-        <data name="GSVA_result" format="csv" label="GSVA_enrich_result" />
-        <data format="pdf" name="output_img_file" label="GSVA_heatmap">
-            <change_format>
-                <when input="imgfile.img_args.img_type" value="PNG" format="png"/>
-                <when input="imgfile.img_args.img_type" value="JPG" format="jpg"/>
-            </change_format>
-        </data>
-    </outputs>
-	<tests>
-	  <test>
-	    <param name="expression_data" value="gsva_input2_GSE10245.csv" ftype="csv" />
-	    <param name="geneSet" value="GeneSet_from_Msigdb_KEGG.rdata" ftype="rdata" />
-	    <param name="method" value="gsva" />
-	    <section name="imgfile">
-	      <conditional name="img_args">
-	        <param name="img_type" value="PNG" />
-	        <param name="img_width" value="480" />
-	        <param name="img_height" value="480" />
-	      </conditional>
-	    </section>
-	    <output name="GSVA_result" file="GSVA_enrich_result.csv" ftype="csv" />
-	    <output name="output_img_file" file="GSVA_heatmap.png"  ftype="png" />
-	  </test>
-	</tests>
-
-
-    <help><![CDATA[
-
-.. class:: infomark
-
-**What it does**
-
-Gene Set Variation Analysis (GSVA) is a Gene Set Enrichment (GSE) method that estimates variation of pathway activity over a sample population in an unsupervised manner.
-
-This tool is built on the function `gsva()` from the Bioconductor package `GSVA`, which provides four methods for analyzing microarray and RNA-seq data.
-
-- **GSVA**: GSVA calculates sample-wise gene set enrichment scores as a function of genes inside and outside the gene set, analogously to a competitive gene set test. And it estimates variation of gene set enrichment over the samples independently of any class label.
-
-- **PLAGE**: Pathway Level Analysis of Gene Expression (PLAGE) standardizes each gene expression profile over the samples and then estimates the pathway activity profiles for each gene set as the coefficients of the first right-singular vector of the singular value decomposition.
-
-- **z-score**: The combined z-score method also standardizes each gene expression profile into z-scores and combine the individual gene z-scores per sample to pathway activity profile.
-
-- **ssGSEA**: The ssGSEA method uses the difference in empirical cumulative distribution functions of gene expression rank inside and outside the gene set to calculate an enrichment statistic per sample which is further normalized by the range of values taken throughout all gene sets and samples.
-
---------
-
-=========
-**Input**
-=========
-
-
-**Gene expression data**
-
-The input data is a CSV file containing a matrix of expression values where rows correspond to genes and columns correspond to samples.
-The recommended gene identifier is Entrez ID.
-
-**Gene Sets**
-
-Gene Sets is an RData file containing an object named `geneSet`, a GeneSetCollection built with the Bioconductor package `GSEABase`. You can use
-gene sets from MSigDB/KEGG to build this file.
-
-**Method**
-
-Method to employ in the estimation of gene-set enrichment scores per sample. By default this is set to `GSVA` and other options are `ssGSEA`, `z-score` or `PLAGE`. The latter two standardize first expression profiles into z-scores over the samples and, in the case of zscore, it combines them together as their sum divided by the square-root of the size of the gene set, while in the case of plage they are used to calculate the singular value decomposition (SVD) over the genes in the gene set and use the coefficients of the first right-singular vector as pathway activity profile.
-
---------
-
-==========
-**Output**
-==========
-
-**1. A gene-set by sample matrix GSVA enrichment scores**
-
-    ========= ========== ======== ======== ======== ==== =========
-    geneSet     sample_1 sample_2 sample_3 sample_4 ...  sample_n
-    ========= ========== ======== ======== ======== ==== =========
-    pathway_1
-    pathway_2
-    pathway_3
-    pathway_4
-    ...
-    pathway_n
-    ========= ========== ======== ======== ======== ==== =========
-
-
-**2. Heatmap for the matrix GSVA enrichment scores**
-
- You can define the heatmap file type, width and height in the Heatmap section of the inputs.
-
-    ]]></help>
-
-    <citations>
-        <citation type="doi">10.1186/1471-2105-14-7</citation>
-    </citations>
-
-
-</tool>
\ No newline at end of file