Mercurial > repos > proteore > proteore_go_terms_profiles_comparison
comparison GO_prof_comp.R @ 0:fe80e3b6b5c2 draft default tip
planemo upload commit b8671ffe2e12dc6612b971a3e6e1dc71496aefd0-dirty
| author | proteore |
|---|---|
| date | Fri, 24 Jan 2020 10:34:33 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fe80e3b6b5c2 |
|---|---|
| 1 options(warn=-1) #TURN OFF WARNINGS !!!!!! | |
| 2 suppressMessages(library(clusterProfiler,quietly = TRUE)) | |
| 3 suppressMessages(library(plyr, quietly = TRUE)) | |
| 4 suppressMessages(library(ggplot2, quietly = TRUE)) | |
| 5 suppressMessages(library(DOSE, quietly = TRUE)) | |
| 6 | |
| 7 #return the number of character from the longest description found (from the 10 first) | |
| 8 max_str_length_10_first <- function(vector){ | |
| 9 vector <- as.vector(vector) | |
| 10 nb_description = length(vector) | |
| 11 if (nb_description >= 10){nb_description=10} | |
| 12 return(max(nchar(vector[1:nb_description]))) | |
| 13 } | |
| 14 | |
| 15 str2bool <- function(x){ | |
| 16 if (any(is.element(c("t","true"),tolower(x)))){ | |
| 17 return (TRUE) | |
| 18 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
| 19 return (FALSE) | |
| 20 }else{ | |
| 21 return(NULL) | |
| 22 } | |
| 23 } | |
| 24 | |
| 25 get_args <- function(){ | |
| 26 | |
| 27 ## Collect arguments | |
| 28 args <- commandArgs(TRUE) | |
| 29 | |
| 30 ## Default setting when no arguments passed | |
| 31 if(length(args) < 1) { | |
| 32 args <- c("--help") | |
| 33 } | |
| 34 | |
| 35 ## Help section | |
| 36 if("--help" %in% args) { | |
| 37 cat("Selection and Annotation HPA | |
| 38 Arguments: | |
| 39 --inputtype1: type of input (list of id or filename) | |
| 40 --inputtype2: type of input (list of id or filename) | |
| 41 --input1: input1 | |
| 42 --input2: input2 | |
| 43 --column1: the column number which you would like to apply... | |
| 44 --column2: the column number which you would like to apply... | |
| 45 --header1: true/false if your file contains a header | |
| 46 --header2: true/false if your file contains a header | |
| 47 --ont: ontology to use | |
| 48 --lev: ontology level | |
| 49 --org: organism db package | |
| 50 --list_name1: name of the first list | |
| 51 --list_name2: name of the second list \n") | |
| 52 | |
| 53 q(save="no") | |
| 54 } | |
| 55 | |
| 56 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | |
| 57 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | |
| 58 args <- as.list(as.character(argsDF$V2)) | |
| 59 names(args) <- argsDF$V1 | |
| 60 | |
| 61 return(args) | |
| 62 } | |
| 63 | |
| 64 get_ids=function(inputtype, input, ncol, header) { | |
| 65 | |
| 66 if (inputtype == "text") { | |
| 67 ids = strsplit(input, "[ \t\n]+")[[1]] | |
| 68 } else if (inputtype == "file") { | |
| 69 header=str2bool(header) | |
| 70 ncol=get_cols(ncol) | |
| 71 csv = read.csv(input,header=header, sep="\t", as.is=T) | |
| 72 ids=csv[,ncol] | |
| 73 } | |
| 74 | |
| 75 ids = unlist(strsplit(as.character(ids),";")) | |
| 76 ids = ids[which(!is.na(ids))] | |
| 77 | |
| 78 return(ids) | |
| 79 } | |
| 80 | |
| 81 str2bool <- function(x){ | |
| 82 if (any(is.element(c("t","true"),tolower(x)))){ | |
| 83 return (TRUE) | |
| 84 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
| 85 return (FALSE) | |
| 86 }else{ | |
| 87 return(NULL) | |
| 88 } | |
| 89 } | |
| 90 | |
| 91 check_ids <- function(vector,type) { | |
| 92 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" | |
| 93 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" | |
| 94 if (type == "entrez") | |
| 95 return(grepl(entrez_id,vector)) | |
| 96 else if (type == "uniprot") { | |
| 97 return(grepl(uniprot_pattern,vector)) | |
| 98 } | |
| 99 } | |
| 100 | |
| 101 #res.cmp@compareClusterResult$Description <- sapply(as.vector(res.cmp@compareClusterResult$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) | |
| 102 fortify.compareClusterResult <- function(res.cmp, showCategory=30, by="geneRatio", split=NULL, includeAll=TRUE) { | |
| 103 clProf.df <- as.data.frame(res.cmp) | |
| 104 .split <- split | |
| 105 ## get top 5 (default) categories of each gene cluster. | |
| 106 if (is.null(showCategory)) { | |
| 107 result <- clProf.df | |
| 108 } else { | |
| 109 Cluster <- NULL # to satisfy codetools | |
| 110 topN <- function(res, showCategory) { | |
| 111 ddply(.data = res, .variables = .(Cluster), .fun = function(df, N) { | |
| 112 if (length(df$Count) > N) { | |
| 113 if (any(colnames(df) == "pvalue")) { | |
| 114 idx <- order(df$pvalue, decreasing=FALSE)[1:N] | |
| 115 } else { | |
| 116 ## for groupGO | |
| 117 idx <- order(df$Count, decreasing=T)[1:N] | |
| 118 } | |
| 119 return(df[idx,]) | |
| 120 } else { | |
| 121 return(df) | |
| 122 } | |
| 123 }, | |
| 124 N=showCategory | |
| 125 ) | |
| 126 } | |
| 127 if (!is.null(.split) && .split %in% colnames(clProf.df)) { | |
| 128 lres <- split(clProf.df, as.character(clProf.df[, .split])) | |
| 129 lres <- lapply(lres, topN, showCategory = showCategory) | |
| 130 result <- do.call('rbind', lres) | |
| 131 } else { | |
| 132 result <- topN(clProf.df, showCategory) | |
| 133 } | |
| 134 } | |
| 135 ID <- NULL | |
| 136 if (includeAll == TRUE) { | |
| 137 result = subset(clProf.df, ID %in% result$ID) | |
| 138 } | |
| 139 ## remove zero count | |
| 140 result$Description <- as.character(result$Description) ## un-factor | |
| 141 GOlevel <- result[,c("ID", "Description")] ## GO ID and Term | |
| 142 GOlevel <- unique(GOlevel) | |
| 143 result <- result[result$Count != 0, ] | |
| 144 result$Description <- factor(result$Description,levels=rev(GOlevel[,2])) | |
| 145 if (by=="rowPercentage") { | |
| 146 Description <- Count <- NULL # to satisfy codetools | |
| 147 result <- ddply(result,.(Description),transform,Percentage = Count/sum(Count),Total = sum(Count)) | |
| 148 ## label GO Description with gene counts. | |
| 149 x <- mdply(result[, c("Description", "Total")], paste, sep=" (") | |
| 150 y <- sapply(x[,3], paste, ")", sep="") | |
| 151 result$Description <- y | |
| 152 | |
| 153 ## restore the original order of GO Description | |
| 154 xx <- result[,c(2,3)] | |
| 155 xx <- unique(xx) | |
| 156 rownames(xx) <- xx[,1] | |
| 157 Termlevel <- xx[as.character(GOlevel[,1]),2] | |
| 158 | |
| 159 ##drop the *Total* column | |
| 160 result <- result[, colnames(result) != "Total"] | |
| 161 result$Description <- factor(result$Description, levels=rev(Termlevel)) | |
| 162 | |
| 163 } else if (by == "count") { | |
| 164 ## nothing | |
| 165 } else if (by == "geneRatio") { ##default | |
| 166 gsize <- as.numeric(sub("/\\d+$", "", as.character(result$GeneRatio))) | |
| 167 gcsize <- as.numeric(sub("^\\d+/", "", as.character(result$GeneRatio))) | |
| 168 result$GeneRatio = gsize/gcsize | |
| 169 cluster <- paste(as.character(result$Cluster),"\n", "(", gcsize, ")", sep="") | |
| 170 lv <- unique(cluster)[order(as.numeric(unique(result$Cluster)))] | |
| 171 result$Cluster <- factor(cluster, levels = lv) | |
| 172 } else { | |
| 173 ## nothing | |
| 174 } | |
| 175 return(result) | |
| 176 } | |
| 177 | |
| 178 ##function plotting.clusteProfile from clusterProfiler pkg | |
| 179 plotting.clusterProfile <- function(clProf.reshape.df,x = ~Cluster,type = "dot", colorBy = "p.adjust",by = "geneRatio",title="",font.size=12) { | |
| 180 | |
| 181 Description <- Percentage <- Count <- Cluster <- GeneRatio <- p.adjust <- pvalue <- NULL # to | |
| 182 if (type == "dot") { | |
| 183 if (by == "rowPercentage") { | |
| 184 p <- ggplot(clProf.reshape.df, | |
| 185 aes_(x = x, y = ~Description, size = ~Percentage)) | |
| 186 } else if (by == "count") { | |
| 187 p <- ggplot(clProf.reshape.df, | |
| 188 aes_(x = x, y = ~Description, size = ~Count)) | |
| 189 } else if (by == "geneRatio") { ##DEFAULT | |
| 190 p <- ggplot(clProf.reshape.df, | |
| 191 aes_(x = x, y = ~Description, size = ~GeneRatio)) | |
| 192 } else { | |
| 193 ## nothing here | |
| 194 } | |
| 195 if (any(colnames(clProf.reshape.df) == colorBy)) { | |
| 196 p <- p + | |
| 197 geom_point() + | |
| 198 aes_string(color=colorBy) + | |
| 199 scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE)) | |
| 200 ## scale_color_gradientn(guide=guide_colorbar(reverse=TRUE), colors = enrichplot:::sig_palette) | |
| 201 } else { | |
| 202 p <- p + geom_point(colour="steelblue") | |
| 203 } | |
| 204 } | |
| 205 | |
| 206 p <- p + xlab("") + ylab("") + ggtitle(title) + | |
| 207 theme_dose(font.size) | |
| 208 | |
| 209 ## theme(axis.text.x = element_text(colour="black", size=font.size, vjust = 1)) + | |
| 210 ## theme(axis.text.y = element_text(colour="black", | |
| 211 ## size=font.size, hjust = 1)) + | |
| 212 ## ggtitle(title)+theme_bw() | |
| 213 ## p <- p + theme(axis.text.x = element_text(angle=angle.axis.x, | |
| 214 ## hjust=hjust.axis.x, | |
| 215 ## vjust=vjust.axis.x)) | |
| 216 | |
| 217 return(p) | |
| 218 } | |
| 219 | |
| 220 make_dotplot<-function(res.cmp,ontology) { | |
| 221 | |
| 222 dfok<-fortify.compareClusterResult(res.cmp) | |
| 223 dfok$Description <- sapply(as.vector(dfok$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) | |
| 224 p<-plotting.clusterProfile(dfok, title="") | |
| 225 | |
| 226 #plot(p, type="dot") # | |
| 227 output_path= paste("GO_profiles_comp_",ontology,".png",sep="") | |
| 228 png(output_path,height = 720, width = 600) | |
| 229 pl <- plot(p, type="dot") | |
| 230 print(pl) | |
| 231 dev.off() | |
| 232 } | |
| 233 | |
| 234 get_cols <-function(input_cols) { | |
| 235 input_cols <- gsub("c","",gsub("C","",gsub(" ","",input_cols))) | |
| 236 if (grepl(":",input_cols)) { | |
| 237 first_col=unlist(strsplit(input_cols,":"))[1] | |
| 238 last_col=unlist(strsplit(input_cols,":"))[2] | |
| 239 cols=first_col:last_col | |
| 240 } else { | |
| 241 cols = as.integer(unlist(strsplit(input_cols,","))) | |
| 242 } | |
| 243 return(cols) | |
| 244 } | |
| 245 | |
| 246 #to check | |
| 247 cmp.GO <- function(l,fun="groupGO",orgdb, ontology, level=3, readable=TRUE) { | |
| 248 cmpGO<-compareCluster(geneClusters = l, | |
| 249 fun=fun, | |
| 250 OrgDb = orgdb, | |
| 251 ont=ontology, | |
| 252 level=level, | |
| 253 readable=TRUE) | |
| 254 | |
| 255 return(cmpGO) | |
| 256 } | |
| 257 | |
| 258 check_ids <- function(vector,type) { | |
| 259 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" | |
| 260 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" | |
| 261 if (type == "entrez") | |
| 262 return(grepl(entrez_id,vector)) | |
| 263 else if (type == "uniprot") { | |
| 264 return(grepl(uniprot_pattern,vector)) | |
| 265 } | |
| 266 } | |
| 267 | |
| 268 main = function() { | |
| 269 | |
| 270 #to get the args of the command line | |
| 271 args=get_args() | |
| 272 | |
| 273 | |
| 274 ids1<-get_ids(args$inputtype1, args$input1, args$column1, args$header1) | |
| 275 ids2<-get_ids(args$inputtype2, args$input2, args$column2, args$header2) | |
| 276 ont = strsplit(args$ont, ",")[[1]] | |
| 277 lev=as.integer(args$lev) | |
| 278 org=args$org | |
| 279 | |
| 280 #load annot package | |
| 281 suppressMessages(library(args$org, character.only = TRUE, quietly = TRUE)) | |
| 282 | |
| 283 # Extract OrgDb | |
| 284 if (args$org=="org.Hs.eg.db") { | |
| 285 orgdb<-org.Hs.eg.db | |
| 286 } else if (args$org=="org.Mm.eg.db") { | |
| 287 orgdb<-org.Mm.eg.db | |
| 288 } else if (args$org=="org.Rn.eg.db") { | |
| 289 orgdb<-org.Rn.eg.db | |
| 290 } | |
| 291 | |
| 292 for(ontology in ont) { | |
| 293 liste = list("l1"=ids1,"l2"=ids2) | |
| 294 names(liste) = c(args$list_name1,args$list_name2) | |
| 295 res.cmp<-cmp.GO(l=liste,fun="groupGO",orgdb, ontology, level=lev, readable=TRUE) | |
| 296 make_dotplot(res.cmp,ontology) | |
| 297 output_path = paste("GO_profiles_comp_",ontology,".tsv",sep="") | |
| 298 write.table(res.cmp@compareClusterResult, output_path, sep="\t", row.names=F, quote=F) | |
| 299 } | |
| 300 | |
| 301 } #end main | |
| 302 | |
| 303 main() | |
| 304 |
