Mercurial > repos > proteore > proteore_go_terms_profiles_comparison
diff GO_prof_comp.R @ 0:fe80e3b6b5c2 draft default tip
planemo upload commit b8671ffe2e12dc6612b971a3e6e1dc71496aefd0-dirty
| author | proteore |
|---|---|
| date | Fri, 24 Jan 2020 10:34:33 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GO_prof_comp.R Fri Jan 24 10:34:33 2020 -0500 @@ -0,0 +1,304 @@ +options(warn=-1) #TURN OFF WARNINGS !!!!!! +suppressMessages(library(clusterProfiler,quietly = TRUE)) +suppressMessages(library(plyr, quietly = TRUE)) +suppressMessages(library(ggplot2, quietly = TRUE)) +suppressMessages(library(DOSE, quietly = TRUE)) + +#return the number of character from the longest description found (from the 10 first) +max_str_length_10_first <- function(vector){ + vector <- as.vector(vector) + nb_description = length(vector) + if (nb_description >= 10){nb_description=10} + return(max(nchar(vector[1:nb_description]))) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +get_args <- function(){ + + ## Collect arguments + args <- commandArgs(TRUE) + + ## Default setting when no arguments passed + if(length(args) < 1) { + args <- c("--help") + } + + ## Help section + if("--help" %in% args) { + cat("Selection and Annotation HPA + Arguments: + --inputtype1: type of input (list of id or filename) + --inputtype2: type of input (list of id or filename) + --input1: input1 + --input2: input2 + --column1: the column number which you would like to apply... + --column2: the column number which you would like to apply... + --header1: true/false if your file contains a header + --header2: true/false if your file contains a header + --ont: ontology to use + --lev: ontology level + --org: organism db package + --list_name1: name of the first list + --list_name2: name of the second list \n") + + q(save="no") + } + + parseArgs <- function(x) strsplit(sub("^--", "", x), "=") + argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) + args <- as.list(as.character(argsDF$V2)) + names(args) <- argsDF$V1 + + return(args) +} + +get_ids=function(inputtype, input, ncol, header) { + + if (inputtype == "text") { + ids = strsplit(input, "[ \t\n]+")[[1]] + } else if (inputtype == "file") { + header=str2bool(header) + ncol=get_cols(ncol) + csv = read.csv(input,header=header, sep="\t", as.is=T) + ids=csv[,ncol] + } + + ids = unlist(strsplit(as.character(ids),";")) + ids = ids[which(!is.na(ids))] + + return(ids) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +check_ids <- function(vector,type) { + uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" + entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" + if (type == "entrez") + return(grepl(entrez_id,vector)) + else if (type == "uniprot") { + return(grepl(uniprot_pattern,vector)) + } +} + +#res.cmp@compareClusterResult$Description <- sapply(as.vector(res.cmp@compareClusterResult$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) +fortify.compareClusterResult <- function(res.cmp, showCategory=30, by="geneRatio", split=NULL, includeAll=TRUE) { + clProf.df <- as.data.frame(res.cmp) + .split <- split + ## get top 5 (default) categories of each gene cluster. + if (is.null(showCategory)) { + result <- clProf.df + } else { + Cluster <- NULL # to satisfy codetools + topN <- function(res, showCategory) { + ddply(.data = res, .variables = .(Cluster), .fun = function(df, N) { + if (length(df$Count) > N) { + if (any(colnames(df) == "pvalue")) { + idx <- order(df$pvalue, decreasing=FALSE)[1:N] + } else { + ## for groupGO + idx <- order(df$Count, decreasing=T)[1:N] + } + return(df[idx,]) + } else { + return(df) + } + }, + N=showCategory + ) + } + if (!is.null(.split) && .split %in% colnames(clProf.df)) { + lres <- split(clProf.df, as.character(clProf.df[, .split])) + lres <- lapply(lres, topN, showCategory = showCategory) + result <- do.call('rbind', lres) + } else { + result <- topN(clProf.df, showCategory) + } + } + ID <- NULL + if (includeAll == TRUE) { + result = subset(clProf.df, ID %in% result$ID) + } + ## remove zero count + result$Description <- as.character(result$Description) ## un-factor + GOlevel <- result[,c("ID", "Description")] ## GO ID and Term + GOlevel <- unique(GOlevel) + result <- result[result$Count != 0, ] + result$Description <- factor(result$Description,levels=rev(GOlevel[,2])) + if (by=="rowPercentage") { + Description <- Count <- NULL # to satisfy codetools + result <- ddply(result,.(Description),transform,Percentage = Count/sum(Count),Total = sum(Count)) + ## label GO Description with gene counts. + x <- mdply(result[, c("Description", "Total")], paste, sep=" (") + y <- sapply(x[,3], paste, ")", sep="") + result$Description <- y + + ## restore the original order of GO Description + xx <- result[,c(2,3)] + xx <- unique(xx) + rownames(xx) <- xx[,1] + Termlevel <- xx[as.character(GOlevel[,1]),2] + + ##drop the *Total* column + result <- result[, colnames(result) != "Total"] + result$Description <- factor(result$Description, levels=rev(Termlevel)) + + } else if (by == "count") { + ## nothing + } else if (by == "geneRatio") { ##default + gsize <- as.numeric(sub("/\\d+$", "", as.character(result$GeneRatio))) + gcsize <- as.numeric(sub("^\\d+/", "", as.character(result$GeneRatio))) + result$GeneRatio = gsize/gcsize + cluster <- paste(as.character(result$Cluster),"\n", "(", gcsize, ")", sep="") + lv <- unique(cluster)[order(as.numeric(unique(result$Cluster)))] + result$Cluster <- factor(cluster, levels = lv) + } else { + ## nothing + } + return(result) +} + +##function plotting.clusteProfile from clusterProfiler pkg +plotting.clusterProfile <- function(clProf.reshape.df,x = ~Cluster,type = "dot", colorBy = "p.adjust",by = "geneRatio",title="",font.size=12) { + + Description <- Percentage <- Count <- Cluster <- GeneRatio <- p.adjust <- pvalue <- NULL # to + if (type == "dot") { + if (by == "rowPercentage") { + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~Percentage)) + } else if (by == "count") { + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~Count)) + } else if (by == "geneRatio") { ##DEFAULT + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~GeneRatio)) + } else { + ## nothing here + } + if (any(colnames(clProf.reshape.df) == colorBy)) { + p <- p + + geom_point() + + aes_string(color=colorBy) + + scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE)) + ## scale_color_gradientn(guide=guide_colorbar(reverse=TRUE), colors = enrichplot:::sig_palette) + } else { + p <- p + geom_point(colour="steelblue") + } + } + + p <- p + xlab("") + ylab("") + ggtitle(title) + + theme_dose(font.size) + + ## theme(axis.text.x = element_text(colour="black", size=font.size, vjust = 1)) + + ## theme(axis.text.y = element_text(colour="black", + ## size=font.size, hjust = 1)) + + ## ggtitle(title)+theme_bw() + ## p <- p + theme(axis.text.x = element_text(angle=angle.axis.x, + ## hjust=hjust.axis.x, + ## vjust=vjust.axis.x)) + + return(p) +} + +make_dotplot<-function(res.cmp,ontology) { + + dfok<-fortify.compareClusterResult(res.cmp) + dfok$Description <- sapply(as.vector(dfok$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) + p<-plotting.clusterProfile(dfok, title="") + + #plot(p, type="dot") # + output_path= paste("GO_profiles_comp_",ontology,".png",sep="") + png(output_path,height = 720, width = 600) + pl <- plot(p, type="dot") + print(pl) + dev.off() +} + +get_cols <-function(input_cols) { + input_cols <- gsub("c","",gsub("C","",gsub(" ","",input_cols))) + if (grepl(":",input_cols)) { + first_col=unlist(strsplit(input_cols,":"))[1] + last_col=unlist(strsplit(input_cols,":"))[2] + cols=first_col:last_col + } else { + cols = as.integer(unlist(strsplit(input_cols,","))) + } + return(cols) +} + +#to check +cmp.GO <- function(l,fun="groupGO",orgdb, ontology, level=3, readable=TRUE) { + cmpGO<-compareCluster(geneClusters = l, + fun=fun, + OrgDb = orgdb, + ont=ontology, + level=level, + readable=TRUE) + + return(cmpGO) +} + +check_ids <- function(vector,type) { + uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" + entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" + if (type == "entrez") + return(grepl(entrez_id,vector)) + else if (type == "uniprot") { + return(grepl(uniprot_pattern,vector)) + } +} + +main = function() { + + #to get the args of the command line + args=get_args() + + + ids1<-get_ids(args$inputtype1, args$input1, args$column1, args$header1) + ids2<-get_ids(args$inputtype2, args$input2, args$column2, args$header2) + ont = strsplit(args$ont, ",")[[1]] + lev=as.integer(args$lev) + org=args$org + + #load annot package + suppressMessages(library(args$org, character.only = TRUE, quietly = TRUE)) + + # Extract OrgDb + if (args$org=="org.Hs.eg.db") { + orgdb<-org.Hs.eg.db + } else if (args$org=="org.Mm.eg.db") { + orgdb<-org.Mm.eg.db + } else if (args$org=="org.Rn.eg.db") { + orgdb<-org.Rn.eg.db + } + + for(ontology in ont) { + liste = list("l1"=ids1,"l2"=ids2) + names(liste) = c(args$list_name1,args$list_name2) + res.cmp<-cmp.GO(l=liste,fun="groupGO",orgdb, ontology, level=lev, readable=TRUE) + make_dotplot(res.cmp,ontology) + output_path = paste("GO_profiles_comp_",ontology,".tsv",sep="") + write.table(res.cmp@compareClusterResult, output_path, sep="\t", row.names=F, quote=F) + } + +} #end main + +main() +
