annotate msigdb_kegg_geneSet.R @ 1:0c0d33699925 draft default tip

Uploaded
author mora-lab
date Thu, 20 May 2021 08:37:05 +0000
parents 700fc491b2b7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
1 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
2 # title: msigdb_kegg_geneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
3 # author: Xiaowei
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
4 # time: Jan.7 2021
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
5 # function: kegg.download, path.to.geneSet, kegg.geneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
6 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
7
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
8
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
9 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
10 # load packages
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
11 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
12 suppressPackageStartupMessages(library(KEGGREST))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
13 suppressPackageStartupMessages(library(GSEABase))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
14
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
15
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
16 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
17 # Function -- kegg.download
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
18 # description: download pathway from kegg
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
19 # input: One or more KEGG identifiers
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
20 # output: A list wrapping a KEGG flat file.
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
21 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
22 kegg.download <- function(pathway.list){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
23 #因为keggGet()一次最大查询10条,所以这里先将pathway.list分成10*n个,每10个放在pathway.x中
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
24 #确定pathway.x长度
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
25 if (length(pathway.list)%%10 == 0){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
26 pathway.x.len <- length(pathway.list)%/%10
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
27 }else{
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
28 pathway.x.len <- length(pathway.list)%/%10 + 1
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
29 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
30 #pathway.x,每个中包含10个pathway
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
31 pathway.x <- vector(mode = "list", length = pathway.x.len)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
32 for (i in 1:length(pathway.x)){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
33 min.x <- 10*(i -1)+1
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
34 max.x <- 10*i
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
35 pathway.x[[i]] <- names(pathway.list)[min.x:max.x]
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
36 rm(min.x,max.x)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
37 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
38 #下载n次,每次下载10个pathway
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
39 pathway.kegg <- list()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
40 for (i in 1:pathway.x.len){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
41 pathway.kegg.x <- keggGet(pathway.x[[i]])
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
42 pathway.kegg <- append(pathway.kegg, pathway.kegg.x)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
43 rm(pathway.kegg.x)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
44 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
45
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
46 names(pathway.kegg) <- unlist(lapply(names(pathway.list), function(x){trimws(strsplit(x, ':', fixed = TRUE)[[1]][2])} ))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
47
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
48 return(pathway.kegg)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
49
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
50 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
51
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
52 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
53 # Function -- path.to.GeneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
54 # description: make the result of kegg.download to GeneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
55 # input: the result of kegg.download
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
56 # Output: A GeneSet object
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
57 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
58 path.to.geneSet <- function(kegg.path){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
59 genes <- kegg.path$GENE
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
60
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
61 if(!is.null(genes)){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
62 genelist_entrez <- genes[1:length(genes)%%2 ==1] #entrez
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
63
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
64 gs <- GeneSet(geneIds = as.character(genelist_entrez),
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
65 geneIdType = EntrezIdentifier(),
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
66 #organism = 'hsa',
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
67 collectionType = KEGGCollection(),
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
68 #longDescription = kegg.path$DESCRIPTION,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
69 #shortDescription = names(kegg.path$PATHWAY_MAP),
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
70 setName = kegg.path$PATHWAY_MAP,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
71 )
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
72 }else{gs = NULL}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
73
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
74 return(gs)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
75 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
76
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
77 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
78 # Function -- kegg.geneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
79 # description: download pathway of one organism from KEGG and make it as GeneSetCollection
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
80 # input: a KEGG organism code (list via keggList("organism"))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
81 # output: A GeneSetCollection object or/and GMT file, gene ID is Entrez
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
82 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
83
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
84 kegg.geneSet <- function(organism = "hsa", outputfile = NULL){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
85 pathway.list <- keggList("pathway", organism) #获取所有pathway的名称和kegg标识符
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
86
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
87 kegg.path <- kegg.download(pathway.list = pathway.list) #download pathway
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
88
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
89 kegg.geneSetCollection <- mapply(path.to.geneSet, kegg.path) #pathway to GeneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
90
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
91 null.index <- unlist(lapply(kegg.geneSetCollection, is.null)) #remove NULL
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
92 kegg.geneSetCollection <- GeneSetCollection(kegg.geneSetCollection[!null.index])
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
93
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
94 #导出为gmt文件
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
95 if (!is.null(outputfile)){toGmt(x=kegg.geneSetCollection, con = outputfile)}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
96
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
97 return(kegg.geneSetCollection)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
98 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
99
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
100 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
101 # Function -- msigdbr.geneSet
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
102 # description: download pathway of one organism from msigdb and make it as GeneSetCollection
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
103 # input:
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
104 # species: Species name, such as Homo sapiens or Mus musculus,See more via msigdbr_species()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
105 # category: MSigDB collection abbreviation, such as H or C1. See more via msigdbr_collections()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
106 # subcategory: MSigDB sub-collection abbreviation, such as CGP or BP.
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
107 # if it has more than one subcategories using `,` connected the words. example: "GO:BP,GO:CC,GO:MF", See more via msigdbr_collections()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
108 # geneIdType: Default as "entrez". one of "entrez" and "symbol"
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
109 # output: A GeneSetCollection object or/and GMT file
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
110 ###############################################################################
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
111 msigdb.geneSet <- function(species, category = NULL, subcategory = NULL, geneIdType ="entrez", outputfile = NULL){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
112 suppressPackageStartupMessages(library(msigdbr))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
113 suppressPackageStartupMessages(library(GSEABase))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
114 print(subcategory)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
115 # 下载基因集数据框
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
116 if (is.null(subcategory)){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
117 gs = msigdbr(species = species, category = category, subcategory = NULL)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
118 }else{
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
119 subcategory = strsplit(subcategory, ",")[[1]]
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
120 gs.list <- lapply(subcategory, function(x) msigdbr(species = species, category = category, subcategory = x))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
121 gs = do.call(rbind, gs.list)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
122 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
123
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
124 if (geneIdType == "entrez"){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
125 genes = gs$entrez_gene
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
126 geneIdsType = EntrezIdentifier()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
127 }else if (geneIdType == "symbol"){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
128 genes = gs$gene_symbol
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
129 geneIdsType = SymbolIdentifier()
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
130 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
131
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
132 # 根据GeneSet name 转变成list
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
133 gs.names <- unique(gs$gs_name)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
134 gs.list <- lapply(gs.names, function(x){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
135 unique(genes[which(gs$gs_name == x)])
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
136 })
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
137 names(gs.list) = gs.names
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
138
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
139 # 变成GeneSetCollection
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
140 gs.geneSetCollection <- GeneSetCollection(mapply(function(x,y){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
141 genes = as.character(unlist(x))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
142 GeneSet(geneIds = genes,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
143 geneIdType = geneIdsType,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
144 collectionType = NullCollection(),
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
145 setName = y
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
146 )
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
147 }, gs.list, gs.names))
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
148
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
149 #导出为gmt文件
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
150 if (!is.null(outputfile)){toGmt(x=gs.geneSetCollection, con = outputfile)}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
151
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
152 return(gs.geneSetCollection)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
153 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
154
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
155
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
156 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
157 #how to pass parameters
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
158 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
159 spec <- matrix(c("source","S", 1, "character", "KEGG or Msigdb",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
160 "organism","O",1, "character", "a KEGG organism code",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
161 "species","P",1,"character", "Species name, such as Homo sapiens or Mus musculus",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
162 "category","C",1,"character", "MSigDB collection abbreviation, such as H or C1.",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
163 "subcategory","G",1,"character", "MSigDB sub-collection abbreviation, such as CGP or BP.",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
164 "geneIdType","T",1, "character", "Default as 'entrez'. one of 'entrez' and 'symbol'",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
165 "whetherOutputfile", "W",0, "logical", "Whether output GMT file",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
166 "outputGMTFile","F",1, "character", "The GMT file Name",
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
167 "outputRData", "R", 1,"character", "The RData file for the GeneSet "), byrow = T, ncol = 5)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
168
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
169 opt <- getopt::getopt(spec)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
170
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
171 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
172 # run codes
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
173 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
174 if (opt$source == "KEGG"){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
175 geneSet <- kegg.geneSet(opt$organism, outputfile = opt$outputFile)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
176 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
177
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
178 if (opt$source == "Msigdb"){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
179 if(is.null(opt$category)){opt$category = NULL}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
180 if(is.null(opt$subcategory)){opt$subcategory = NULL}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
181 if(is.null(opt$geneIdType)){opt$geneIdType = "entrez"}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
182
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
183 geneSet <- msigdb.geneSet(species = opt$species,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
184 category = opt$category,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
185 subcategory = opt$subcategory,
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
186 geneIdType = opt$geneIdType)
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
187 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
188
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
189 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
190 # Output file
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
191 #=================================================================
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
192 if (!is.null(opt$whetherOutputfile)){
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
193 if (opt$whetherOutputfile){toGmt(x=geneSet, con = opt$outputGMTFile)}
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
194 }
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
195
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
196
700fc491b2b7 Uploaded
mora-lab
parents:
diff changeset
197 save(geneSet, file = opt$outputRData)