Mercurial > repos > mora-lab > geneset_from_msigdb_orkegg
view msigdb_kegg_geneSet.R @ 1:0c0d33699925 draft default tip
Uploaded
author | mora-lab |
---|---|
date | Thu, 20 May 2021 08:37:05 +0000 |
parents | 700fc491b2b7 |
children |
line wrap: on
line source
############################################################################### # title: msigdb_kegg_geneSet # author: Xiaowei # time: Jan.7 2021 # function: kegg.download, path.to.geneSet, kegg.geneSet ############################################################################### ############################################################################### # load packages ############################################################################### suppressPackageStartupMessages(library(KEGGREST)) suppressPackageStartupMessages(library(GSEABase)) ############################################################################### # Function -- kegg.download # description: download pathway from kegg # input: One or more KEGG identifiers # output: A list wrapping a KEGG flat file. ############################################################################### kegg.download <- function(pathway.list){ #因为keggGet()一次最大查询10条,所以这里先将pathway.list分成10*n个,每10个放在pathway.x中 #确定pathway.x长度 if (length(pathway.list)%%10 == 0){ pathway.x.len <- length(pathway.list)%/%10 }else{ pathway.x.len <- length(pathway.list)%/%10 + 1 } #pathway.x,每个中包含10个pathway pathway.x <- vector(mode = "list", length = pathway.x.len) for (i in 1:length(pathway.x)){ min.x <- 10*(i -1)+1 max.x <- 10*i pathway.x[[i]] <- names(pathway.list)[min.x:max.x] rm(min.x,max.x) } #下载n次,每次下载10个pathway pathway.kegg <- list() for (i in 1:pathway.x.len){ pathway.kegg.x <- keggGet(pathway.x[[i]]) pathway.kegg <- append(pathway.kegg, pathway.kegg.x) rm(pathway.kegg.x) } names(pathway.kegg) <- unlist(lapply(names(pathway.list), function(x){trimws(strsplit(x, ':', fixed = TRUE)[[1]][2])} )) return(pathway.kegg) } ############################################################################### # Function -- path.to.GeneSet # description: make the result of kegg.download to GeneSet # input: the result of kegg.download # Output: A GeneSet object ############################################################################### path.to.geneSet <- function(kegg.path){ genes <- kegg.path$GENE if(!is.null(genes)){ genelist_entrez <- genes[1:length(genes)%%2 ==1] #entrez gs <- GeneSet(geneIds = as.character(genelist_entrez), geneIdType = EntrezIdentifier(), #organism = 'hsa', collectionType = KEGGCollection(), #longDescription = kegg.path$DESCRIPTION, #shortDescription = names(kegg.path$PATHWAY_MAP), setName = kegg.path$PATHWAY_MAP, ) }else{gs = NULL} return(gs) } ############################################################################### # Function -- kegg.geneSet # description: download pathway of one organism from KEGG and make it as GeneSetCollection # input: a KEGG organism code (list via keggList("organism")) # output: A GeneSetCollection object or/and GMT file, gene ID is Entrez ############################################################################### kegg.geneSet <- function(organism = "hsa", outputfile = NULL){ pathway.list <- keggList("pathway", organism) #获取所有pathway的名称和kegg标识符 kegg.path <- kegg.download(pathway.list = pathway.list) #download pathway kegg.geneSetCollection <- mapply(path.to.geneSet, kegg.path) #pathway to GeneSet null.index <- unlist(lapply(kegg.geneSetCollection, is.null)) #remove NULL kegg.geneSetCollection <- GeneSetCollection(kegg.geneSetCollection[!null.index]) #导出为gmt文件 if (!is.null(outputfile)){toGmt(x=kegg.geneSetCollection, con = outputfile)} return(kegg.geneSetCollection) } ############################################################################### # Function -- msigdbr.geneSet # description: download pathway of one organism from msigdb and make it as GeneSetCollection # input: # species: Species name, such as Homo sapiens or Mus musculus,See more via msigdbr_species() # category: MSigDB collection abbreviation, such as H or C1. See more via msigdbr_collections() # subcategory: MSigDB sub-collection abbreviation, such as CGP or BP. # if it has more than one subcategories using `,` connected the words. example: "GO:BP,GO:CC,GO:MF", See more via msigdbr_collections() # geneIdType: Default as "entrez". one of "entrez" and "symbol" # output: A GeneSetCollection object or/and GMT file ############################################################################### msigdb.geneSet <- function(species, category = NULL, subcategory = NULL, geneIdType ="entrez", outputfile = NULL){ suppressPackageStartupMessages(library(msigdbr)) suppressPackageStartupMessages(library(GSEABase)) print(subcategory) # 下载基因集数据框 if (is.null(subcategory)){ gs = msigdbr(species = species, category = category, subcategory = NULL) }else{ subcategory = strsplit(subcategory, ",")[[1]] gs.list <- lapply(subcategory, function(x) msigdbr(species = species, category = category, subcategory = x)) gs = do.call(rbind, gs.list) } if (geneIdType == "entrez"){ genes = gs$entrez_gene geneIdsType = EntrezIdentifier() }else if (geneIdType == "symbol"){ genes = gs$gene_symbol geneIdsType = SymbolIdentifier() } # 根据GeneSet name 转变成list gs.names <- unique(gs$gs_name) gs.list <- lapply(gs.names, function(x){ unique(genes[which(gs$gs_name == x)]) }) names(gs.list) = gs.names # 变成GeneSetCollection gs.geneSetCollection <- GeneSetCollection(mapply(function(x,y){ genes = as.character(unlist(x)) GeneSet(geneIds = genes, geneIdType = geneIdsType, collectionType = NullCollection(), setName = y ) }, gs.list, gs.names)) #导出为gmt文件 if (!is.null(outputfile)){toGmt(x=gs.geneSetCollection, con = outputfile)} return(gs.geneSetCollection) } #================================================================= #how to pass parameters #================================================================= spec <- matrix(c("source","S", 1, "character", "KEGG or Msigdb", "organism","O",1, "character", "a KEGG organism code", "species","P",1,"character", "Species name, such as Homo sapiens or Mus musculus", "category","C",1,"character", "MSigDB collection abbreviation, such as H or C1.", "subcategory","G",1,"character", "MSigDB sub-collection abbreviation, such as CGP or BP.", "geneIdType","T",1, "character", "Default as 'entrez'. one of 'entrez' and 'symbol'", "whetherOutputfile", "W",0, "logical", "Whether output GMT file", "outputGMTFile","F",1, "character", "The GMT file Name", "outputRData", "R", 1,"character", "The RData file for the GeneSet "), byrow = T, ncol = 5) opt <- getopt::getopt(spec) #================================================================= # run codes #================================================================= if (opt$source == "KEGG"){ geneSet <- kegg.geneSet(opt$organism, outputfile = opt$outputFile) } if (opt$source == "Msigdb"){ if(is.null(opt$category)){opt$category = NULL} if(is.null(opt$subcategory)){opt$subcategory = NULL} if(is.null(opt$geneIdType)){opt$geneIdType = "entrez"} geneSet <- msigdb.geneSet(species = opt$species, category = opt$category, subcategory = opt$subcategory, geneIdType = opt$geneIdType) } #================================================================= # Output file #================================================================= if (!is.null(opt$whetherOutputfile)){ if (opt$whetherOutputfile){toGmt(x=geneSet, con = opt$outputGMTFile)} } save(geneSet, file = opt$outputRData)