1
|
1 ###############################################################################
|
|
2 # title: Gene set analysis in R
|
|
3 # author: Xiaowei
|
|
4 # time: Mar.31 2021
|
|
5 ###############################################################################
|
|
6 #=================================================================
|
|
7 #how to pass parameters
|
|
8 #=================================================================
|
|
9 spec <- matrix(c("expr_file",'E', 1, 'character', 'Gene expression data which is an CSV file of expression values where rows correspond to genes and columns correspond to samples.',
|
|
10 "geneSet_file", 'G', 1, 'character', 'Gene set',
|
|
11 "design_file", 'D', 1, 'character', 'Design for the samples of expression data',
|
|
12 "min_size", 'I',1, 'numeric','a numeric value indicating the minimum allowed gene set size. Default value is 10.',
|
|
13 "max_size", 'A',1, 'numeric','a numeric value indicating the maximum allowed gene set size. Default value is 500.',
|
|
14 "test_method", 'T',1, 'character', "a character parameter indicating which statistical method to use for testing the gene sets. Must be one of 'GSNCAtest', 'WWtest', 'KStest', 'MDtest', 'RKStest', 'RMDtest'.",
|
|
15 "nperm_number", 'N', 1, 'numeric',"number of permutations used to estimate the null distribution of the test statistic. If not given, a default value 1000 is used.",
|
|
16 "cor_method", 'M', 1, 'character',"a character string indicating which correlation coefficient is to be computed. Possible values are 'pearson' (default), 'spearman' and 'kendall'. Default value is 'pearson'.",
|
|
17 "threshold_value", "V", 1, 'numeric', 'Threshold value for setting significant geneSet.',
|
|
18 "GSAR_output_p_value", 'R', 1, 'character',"P-value table",
|
|
19 "GSAR_output_plot", 'P', 1, 'character',"Plot genes relationships of significant pathway"
|
|
20
|
|
21 ),
|
|
22 byrow = TRUE, ncol = 5)
|
|
23
|
|
24
|
|
25 if (!requireNamespace("getopt", quietly = TRUE))
|
|
26 install.packages("getopt")
|
|
27
|
|
28 opt <- getopt::getopt(spec)
|
|
29
|
|
30 #----------------
|
|
31 #整理参数
|
|
32 #----------------
|
|
33 # expr_file
|
|
34 # geneSet_file
|
|
35 # design_file
|
|
36 if (is.null(opt$min_size)){min_size = 10}else{min_size = opt$min_size}
|
|
37 if (is.null(opt$max_size)){max_size = 500}else{max_size = opt$max_size}
|
|
38 if (is.null(opt$test_method)){test_method = "GSNCAtest"}else{test_method = opt$test_method}
|
|
39 if (is.null(opt$nperm_number)){nperm_number = 1000}else{nperm_number = opt$nperm_number}
|
|
40 if (is.null(opt$cor_method)){cor_method = "pearson"}else{cor_method = opt$cor_method}
|
|
41 if (is.null(opt$threshold_value)){threshold_value = 0.05}else{threshold_value = opt$threshold_value}
|
|
42
|
|
43
|
|
44 #================================================================
|
|
45 #run codes
|
|
46 #================================================================
|
|
47
|
|
48 #--- load package ------------------
|
|
49
|
|
50 suppressPackageStartupMessages(library(GSAR))
|
|
51 suppressPackageStartupMessages(library(GSEABase))
|
|
52 options(stringsAsFactors = FALSE)
|
|
53 #---input --------------------------
|
|
54 # expr
|
|
55 data <- as.matrix(read.csv(opt$expr_file, row.names = 1))
|
|
56
|
|
57 # design
|
|
58 design <- read.csv(opt$design_file, row.names = 1)
|
|
59 group <- design$group
|
|
60 label <- design$label
|
|
61
|
|
62 # geneSet
|
|
63 load(opt$geneSet_file)
|
|
64 geneSetlist <- lapply(geneSet, geneIds)
|
|
65 names(geneSetlist) <- names(geneSet)
|
|
66
|
|
67 #-------GSAR -------------------------
|
|
68 # test.method <- c("GSNCAtest", "WWtest", "KStest", "MDtest", "RKStest", "RMDtest")
|
|
69 # “GSNCAtest”, “WWtest”, “KStest”, “MDtest”, “RKStest”, “RMDtest”
|
|
70 results <- TestGeneSets(object=data, group=group,
|
|
71 geneSets=geneSetlist, min.size=min_size,
|
|
72 max.size=max_size, test=test_method,
|
|
73 nperm = nperm_number)
|
|
74
|
|
75
|
|
76
|
|
77 #================================================================
|
|
78 #output
|
|
79 #================================================================
|
|
80
|
|
81
|
|
82 # output p-value----------------------------------------------------
|
|
83 resutlts1 <- as.data.frame(t(as.data.frame(results)))
|
|
84 colnames(resutlts1) = "P_value"
|
|
85 resutlts1$geneSet <- rownames(resutlts1)
|
|
86 resutlts1 <- resutlts1[,c("geneSet", "P_value")]
|
|
87 resutlts1 <- resutlts1[order(resutlts1$P_value),]
|
|
88 write.csv(resutlts1, file = opt$GSAR_output_p_value, row.names = FALSE)
|
|
89 #-------------------------------------------------------------------
|
|
90
|
|
91
|
|
92
|
|
93 #plot -------------------------------------------------------------
|
|
94 sig.paths <- names(results[results <= threshold_value])
|
|
95
|
|
96
|
|
97 group1 <- unique(design[design$group == 1, "label"])
|
|
98 group2 <- unique(design[design$group == 2, "label"])
|
|
99 allgenes <- rownames(data)
|
|
100
|
|
101 pdf(file = opt$GSAR_output_plot, width = 10.92)
|
|
102 if (length(sig.paths) > 0){
|
|
103 for (sig.path in sig.paths){
|
|
104 path.index <- allgenes %in% geneSetlist[[sig.path]]
|
|
105 ## Plot MST2 for a pathway in two conditions
|
|
106 plotMST2.pathway(object=data[path.index,],
|
|
107 group=group, name=sig.path,
|
|
108 legend.size=1.2, #leg.x=-1.2, leg.y=2,
|
|
109 label.size=1, label.dist=0.8, cor.method=cor_method, group1.name = group1, group2.name = group2)
|
|
110
|
|
111 rm(sig.path, path.index)
|
|
112 }
|
|
113
|
|
114 }
|
|
115 dev.off()
|
|
116
|
|
117
|