0
|
1 ###############################################################################
|
|
2 # title: msigdb_kegg_geneSet
|
|
3 # author: Xiaowei
|
|
4 # time: Jan.7 2021
|
|
5 # function: kegg.download, path.to.geneSet, kegg.geneSet
|
|
6 ###############################################################################
|
|
7
|
|
8
|
|
9 ###############################################################################
|
|
10 # load packages
|
|
11 ###############################################################################
|
|
12 suppressPackageStartupMessages(library(KEGGREST))
|
|
13 suppressPackageStartupMessages(library(GSEABase))
|
|
14
|
|
15
|
|
16 ###############################################################################
|
|
17 # Function -- kegg.download
|
|
18 # description: download pathway from kegg
|
|
19 # input: One or more KEGG identifiers
|
|
20 # output: A list wrapping a KEGG flat file.
|
|
21 ###############################################################################
|
|
22 kegg.download <- function(pathway.list){
|
|
23 #因为keggGet()一次最大查询10条,所以这里先将pathway.list分成10*n个,每10个放在pathway.x中
|
|
24 #确定pathway.x长度
|
|
25 if (length(pathway.list)%%10 == 0){
|
|
26 pathway.x.len <- length(pathway.list)%/%10
|
|
27 }else{
|
|
28 pathway.x.len <- length(pathway.list)%/%10 + 1
|
|
29 }
|
|
30 #pathway.x,每个中包含10个pathway
|
|
31 pathway.x <- vector(mode = "list", length = pathway.x.len)
|
|
32 for (i in 1:length(pathway.x)){
|
|
33 min.x <- 10*(i -1)+1
|
|
34 max.x <- 10*i
|
|
35 pathway.x[[i]] <- names(pathway.list)[min.x:max.x]
|
|
36 rm(min.x,max.x)
|
|
37 }
|
|
38 #下载n次,每次下载10个pathway
|
|
39 pathway.kegg <- list()
|
|
40 for (i in 1:pathway.x.len){
|
|
41 pathway.kegg.x <- keggGet(pathway.x[[i]])
|
|
42 pathway.kegg <- append(pathway.kegg, pathway.kegg.x)
|
|
43 rm(pathway.kegg.x)
|
|
44 }
|
|
45
|
|
46 names(pathway.kegg) <- unlist(lapply(names(pathway.list), function(x){trimws(strsplit(x, ':', fixed = TRUE)[[1]][2])} ))
|
|
47
|
|
48 return(pathway.kegg)
|
|
49
|
|
50 }
|
|
51
|
|
52 ###############################################################################
|
|
53 # Function -- path.to.GeneSet
|
|
54 # description: make the result of kegg.download to GeneSet
|
|
55 # input: the result of kegg.download
|
|
56 # Output: A GeneSet object
|
|
57 ###############################################################################
|
|
58 path.to.geneSet <- function(kegg.path){
|
|
59 genes <- kegg.path$GENE
|
|
60
|
|
61 if(!is.null(genes)){
|
|
62 genelist_entrez <- genes[1:length(genes)%%2 ==1] #entrez
|
|
63
|
|
64 gs <- GeneSet(geneIds = as.character(genelist_entrez),
|
|
65 geneIdType = EntrezIdentifier(),
|
|
66 #organism = 'hsa',
|
|
67 collectionType = KEGGCollection(),
|
|
68 #longDescription = kegg.path$DESCRIPTION,
|
|
69 #shortDescription = names(kegg.path$PATHWAY_MAP),
|
|
70 setName = kegg.path$PATHWAY_MAP,
|
|
71 )
|
|
72 }else{gs = NULL}
|
|
73
|
|
74 return(gs)
|
|
75 }
|
|
76
|
|
77 ###############################################################################
|
|
78 # Function -- kegg.geneSet
|
|
79 # description: download pathway of one organism from KEGG and make it as GeneSetCollection
|
|
80 # input: a KEGG organism code (list via keggList("organism"))
|
|
81 # output: A GeneSetCollection object or/and GMT file, gene ID is Entrez
|
|
82 ###############################################################################
|
|
83
|
|
84 kegg.geneSet <- function(organism = "hsa", outputfile = NULL){
|
|
85 pathway.list <- keggList("pathway", organism) #获取所有pathway的名称和kegg标识符
|
|
86
|
|
87 kegg.path <- kegg.download(pathway.list = pathway.list) #download pathway
|
|
88
|
|
89 kegg.geneSetCollection <- mapply(path.to.geneSet, kegg.path) #pathway to GeneSet
|
|
90
|
|
91 null.index <- unlist(lapply(kegg.geneSetCollection, is.null)) #remove NULL
|
|
92 kegg.geneSetCollection <- GeneSetCollection(kegg.geneSetCollection[!null.index])
|
|
93
|
|
94 #导出为gmt文件
|
|
95 if (!is.null(outputfile)){toGmt(x=kegg.geneSetCollection, con = outputfile)}
|
|
96
|
|
97 return(kegg.geneSetCollection)
|
|
98 }
|
|
99
|
|
100 ###############################################################################
|
|
101 # Function -- msigdbr.geneSet
|
|
102 # description: download pathway of one organism from msigdb and make it as GeneSetCollection
|
|
103 # input:
|
|
104 # species: Species name, such as Homo sapiens or Mus musculus,See more via msigdbr_species()
|
|
105 # category: MSigDB collection abbreviation, such as H or C1. See more via msigdbr_collections()
|
|
106 # subcategory: MSigDB sub-collection abbreviation, such as CGP or BP.
|
|
107 # if it has more than one subcategories using `,` connected the words. example: "GO:BP,GO:CC,GO:MF", See more via msigdbr_collections()
|
|
108 # geneIdType: Default as "entrez". one of "entrez" and "symbol"
|
|
109 # output: A GeneSetCollection object or/and GMT file
|
|
110 ###############################################################################
|
|
111 msigdb.geneSet <- function(species, category = NULL, subcategory = NULL, geneIdType ="entrez", outputfile = NULL){
|
|
112 suppressPackageStartupMessages(library(msigdbr))
|
|
113 suppressPackageStartupMessages(library(GSEABase))
|
|
114 print(subcategory)
|
|
115 # 下载基因集数据框
|
|
116 if (is.null(subcategory)){
|
|
117 gs = msigdbr(species = species, category = category, subcategory = NULL)
|
|
118 }else{
|
|
119 subcategory = strsplit(subcategory, ",")[[1]]
|
|
120 gs.list <- lapply(subcategory, function(x) msigdbr(species = species, category = category, subcategory = x))
|
|
121 gs = do.call(rbind, gs.list)
|
|
122 }
|
|
123
|
|
124 if (geneIdType == "entrez"){
|
|
125 genes = gs$entrez_gene
|
|
126 geneIdsType = EntrezIdentifier()
|
|
127 }else if (geneIdType == "symbol"){
|
|
128 genes = gs$gene_symbol
|
|
129 geneIdsType = SymbolIdentifier()
|
|
130 }
|
|
131
|
|
132 # 根据GeneSet name 转变成list
|
|
133 gs.names <- unique(gs$gs_name)
|
|
134 gs.list <- lapply(gs.names, function(x){
|
|
135 unique(genes[which(gs$gs_name == x)])
|
|
136 })
|
|
137 names(gs.list) = gs.names
|
|
138
|
|
139 # 变成GeneSetCollection
|
|
140 gs.geneSetCollection <- GeneSetCollection(mapply(function(x,y){
|
|
141 genes = as.character(unlist(x))
|
|
142 GeneSet(geneIds = genes,
|
|
143 geneIdType = geneIdsType,
|
|
144 collectionType = NullCollection(),
|
|
145 setName = y
|
|
146 )
|
|
147 }, gs.list, gs.names))
|
|
148
|
|
149 #导出为gmt文件
|
|
150 if (!is.null(outputfile)){toGmt(x=gs.geneSetCollection, con = outputfile)}
|
|
151
|
|
152 return(gs.geneSetCollection)
|
|
153 }
|
|
154
|
|
155
|
|
156 #=================================================================
|
|
157 #how to pass parameters
|
|
158 #=================================================================
|
|
159 spec <- matrix(c("source","S", 1, "character", "KEGG or Msigdb",
|
|
160 "organism","O",1, "character", "a KEGG organism code",
|
|
161 "species","P",1,"character", "Species name, such as Homo sapiens or Mus musculus",
|
|
162 "category","C",1,"character", "MSigDB collection abbreviation, such as H or C1.",
|
|
163 "subcategory","G",1,"character", "MSigDB sub-collection abbreviation, such as CGP or BP.",
|
|
164 "geneIdType","T",1, "character", "Default as 'entrez'. one of 'entrez' and 'symbol'",
|
|
165 "whetherOutputfile", "W",0, "logical", "Whether output GMT file",
|
|
166 "outputGMTFile","F",1, "character", "The GMT file Name",
|
|
167 "outputRData", "R", 1,"character", "The RData file for the GeneSet "), byrow = T, ncol = 5)
|
|
168
|
|
169 opt <- getopt::getopt(spec)
|
|
170
|
|
171 #=================================================================
|
|
172 # run codes
|
|
173 #=================================================================
|
|
174 if (opt$source == "KEGG"){
|
|
175 geneSet <- kegg.geneSet(opt$organism, outputfile = opt$outputFile)
|
|
176 }
|
|
177
|
|
178 if (opt$source == "Msigdb"){
|
|
179 if(is.null(opt$category)){opt$category = NULL}
|
|
180 if(is.null(opt$subcategory)){opt$subcategory = NULL}
|
|
181 if(is.null(opt$geneIdType)){opt$geneIdType = "entrez"}
|
|
182
|
|
183 geneSet <- msigdb.geneSet(species = opt$species,
|
|
184 category = opt$category,
|
|
185 subcategory = opt$subcategory,
|
|
186 geneIdType = opt$geneIdType)
|
|
187 }
|
|
188
|
|
189 #=================================================================
|
|
190 # Output file
|
|
191 #=================================================================
|
|
192 if (!is.null(opt$whetherOutputfile)){
|
|
193 if (opt$whetherOutputfile){toGmt(x=geneSet, con = opt$outputGMTFile)}
|
|
194 }
|
|
195
|
|
196
|
|
197 save(geneSet, file = opt$outputRData)
|